socktpi.c revision 2691240c021e7fd636dd0e4b884a2f76b0cb94d9
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/kmem_impl.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/suntpi.h>
#include <sys/ddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/pathname.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sodirect.h>
#include <netinet/in.h>
#include <sys/un.h>
#include <sys/strsun.h>
#include <sys/tiuser.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/timod.h> /* TI_GETMYNAME, TI_GETPEERNAME */
#include <c2/audit.h>
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/udp_impl.h>
#include <sys/zone.h>
#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/nl7curi.h>
#include <inet/kssl/ksslapi.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/socktpi.h>
#include <fs/sockfs/socktpi_impl.h>
/*
* Possible failures when memory can't be allocated. The documented behavior:
*
* 5.5: 4.X: XNET:
* accept: ENOMEM/ENOSR/EINTR - (EINTR) ENOMEM/ENOBUFS/ENOSR/
* EINTR
* (4.X does not document EINTR but returns it)
* bind: ENOSR - ENOBUFS/ENOSR
* connect: EINTR EINTR ENOBUFS/ENOSR/EINTR
* getpeername: ENOMEM/ENOSR ENOBUFS (-) ENOBUFS/ENOSR
* getsockname: ENOMEM/ENOSR ENOBUFS (-) ENOBUFS/ENOSR
* (4.X getpeername and getsockname do not fail in practice)
* getsockopt: ENOMEM/ENOSR - ENOBUFS/ENOSR
* listen: - - ENOBUFS
* recv: ENOMEM/ENOSR/EINTR EINTR ENOBUFS/ENOMEM/ENOSR/
* EINTR
* send: ENOMEM/ENOSR/EINTR ENOBUFS/EINTR ENOBUFS/ENOMEM/ENOSR/
* EINTR
* setsockopt: ENOMEM/ENOSR - ENOBUFS/ENOMEM/ENOSR
* shutdown: ENOMEM/ENOSR - ENOBUFS/ENOSR
* socket: ENOMEM/ENOSR ENOBUFS ENOBUFS/ENOMEM/ENOSR
* socketpair: ENOMEM/ENOSR - ENOBUFS/ENOMEM/ENOSR
*
* Resolution. When allocation fails:
* recv: return EINTR
* send: return EINTR
* connect, accept: EINTR
* bind, listen, shutdown (unbind, unix_close, disconnect): sleep
* socket, socketpair: ENOBUFS
* getpeername, getsockname: sleep
* getsockopt, setsockopt: sleep
*/
#ifdef SOCK_TEST
/*
* Variables that make sockfs do something other than the standard TPI
* for the AF_INET transports.
*
* solisten_tpi_tcp:
* TCP can handle an O_T_BIND_REQ with an increased backlog even though
* the transport is already bound. This is needed to avoid losing the
* port number should listen() do a T_UNBIND_REQ followed by an
* O_T_BIND_REQ.
*
* soconnect_tpi_udp:
* UDP and ICMP can handle a T_CONN_REQ.
* This is needed to make the sequence of connect(), getsockname()
* return the local IP address used to send packets to the connected to
* destination.
*
* soconnect_tpi_tcp:
* TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
* Set this to non-zero to send TPI conformant messages to TCP in this
* respect. This is a performance optimization.
*
* soaccept_tpi_tcp:
* TCP can handle a T_CONN_REQ without the acceptor being bound.
* This is a performance optimization that has been picked up in XTI.
*
* soaccept_tpi_multioptions:
* When inheriting SOL_SOCKET options from the listener to the accepting
* socket send them as a single message for AF_INET{,6}.
*/
int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;
#else /* SOCK_TEST */
#define soconnect_tpi_tcp 0
#define soconnect_tpi_udp 0
#define solisten_tpi_tcp 0
#define soaccept_tpi_tcp 0
#define soaccept_tpi_multioptions 1
#endif /* SOCK_TEST */
#ifdef SOCK_TEST
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */
/*
* Some X/Open added checks might have to be backed out to keep SunOS 4.X
* applications working. Turn on this flag to disable these checks.
*/
/* Non-zero bypasses X/Open-only checks such as the bind state check. */
int xnet_skip_checks = 0;
/* Non-zero makes the X/Open bind state check printf() when it fails a call. */
int xnet_check_print = 0;
/*
 * NOTE(review): no use of this flag is visible in this part of the file;
 * presumably it enables a diagnostic when X/Open rules truncate data -
 * confirm against its users.
 */
int xnet_truncate_print = 0;
static void sotpi_destroy(struct sonode *);
static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
int, int *, cred_t *cr);
static boolean_t sotpi_info_create(struct sonode *, int);
static void sotpi_info_init(struct sonode *);
static void sotpi_info_fini(struct sonode *);
static void sotpi_info_destroy(struct sonode *);
/*
* Do direct function call to the transport layer below; this would
* also allow the transport to utilize read-side synchronous stream
* interface if necessary. This is a /etc/system tunable that must
* not be modified on a running system. By default this is enabled
* for performance reasons and may be disabled for debugging purposes.
*/
boolean_t socktpi_direct = B_TRUE;
static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
extern void sigintr(k_sigset_t *, int);
extern void sigunintr(k_sigset_t *);
/* Sockets acting as an in-kernel SSL proxy */
extern mblk_t *strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
strsigset_t *, strsigset_t *, strpollset_t *);
extern mblk_t *strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
strsigset_t *, strsigset_t *, strpollset_t *);
static int sotpi_unbind(struct sonode *, int);
extern int sodput(sodirect_t *, mblk_t *);
extern void sodwakeup(sodirect_t *);
/* TPI sockfs sonode operations */
int sotpi_init(struct sonode *, struct sonode *, struct cred *,
int);
static int sotpi_accept(struct sonode *, int, struct cred *,
struct sonode **);
static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
int, struct cred *);
static int sotpi_listen(struct sonode *, int, struct cred *);
static int sotpi_connect(struct sonode *, const struct sockaddr *,
socklen_t, int, int, struct cred *);
extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *,
struct uio *, struct cred *);
static int sotpi_sendmsg(struct sonode *, struct nmsghdr *,
struct uio *, struct cred *);
static int sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
struct cred *, mblk_t **);
static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
struct uio *, void *, t_uscalar_t, int);
static int sodgram_direct(struct sonode *, struct sockaddr *,
socklen_t, struct uio *, int);
extern int sotpi_getpeername(struct sonode *, struct sockaddr *,
socklen_t *, boolean_t, struct cred *);
static int sotpi_getsockname(struct sonode *, struct sockaddr *,
socklen_t *, struct cred *);
static int sotpi_shutdown(struct sonode *, int, struct cred *);
extern int sotpi_getsockopt(struct sonode *, int, int, void *,
socklen_t *, int, struct cred *);
extern int sotpi_setsockopt(struct sonode *, int, int, const void *,
socklen_t, struct cred *);
static int sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
int32_t *);
static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
struct cred *, int32_t *);
static int sotpi_poll(struct sonode *, short, int, short *,
struct pollhead **);
static int sotpi_close(struct sonode *, int, struct cred *);
static int i_sotpi_info_constructor(sotpi_info_t *);
static void i_sotpi_info_destructor(sotpi_info_t *);
/*
 * Operations vector for TPI-based sockets. sotpi_create() hands this table
 * to sonode_init(), and sotpi_destroy() asserts that the sonode it is
 * tearing down still points at it. The initializer is positional; the
 * trailing comment on each line gives the slot the entry fills.
 */
sonodeops_t sotpi_sonodeops = {
sotpi_init, /* sop_init */
sotpi_accept, /* sop_accept */
sotpi_bind, /* sop_bind */
sotpi_listen, /* sop_listen */
sotpi_connect, /* sop_connect */
sotpi_recvmsg, /* sop_recvmsg */
sotpi_sendmsg, /* sop_sendmsg */
sotpi_sendmblk, /* sop_sendmblk */
sotpi_getpeername, /* sop_getpeername */
sotpi_getsockname, /* sop_getsockname */
sotpi_shutdown, /* sop_shutdown */
sotpi_getsockopt, /* sop_getsockopt */
sotpi_setsockopt, /* sop_setsockopt */
sotpi_ioctl, /* sop_ioctl */
sotpi_poll, /* sop_poll */
sotpi_close, /* sop_close */
};
/*
* Return a TPI socket vnode.
*
* Note that sockets assume that the driver will clone (either itself
* or by using the clone driver) i.e. a socket() call will always
* result in a new vnode being created.
*/
/*
* Common create code for socket and accept. If tso is set the values
* from that node are used instead of issuing a T_INFO_REQ.
*/
/*
 * Allocate a TPI sonode from the matching kmem cache and initialize it.
 *
 * sp, family, type and protocol are handed to sonode_init(). version
 * selects the socket version; SOV_DEFAULT is mapped to so_default_version.
 * An AF_NCA request is only honoured in the global zone and is created as
 * an AF_INET socket tagged with NL7C_AF_NCA. sflags and cr are unused: to
 * stay compatible with the old TPI socket implementation the allocation
 * always sleeps, whatever sleep flag the caller passed.
 *
 * Returns the new sonode with *errorp set to 0, or NULL with *errorp set
 * to ENOTSUP when AF_NCA is requested from a non-global zone.
 */
/* ARGSUSED */
static struct sonode *
sotpi_create(struct sockparams *sp, int family, int type, int protocol,
    int version, int sflags, int *errorp, cred_t *cr)
{
	struct sonode	*so;
	kmem_cache_t	*cp;
	boolean_t	nca_request = B_FALSE;

	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);

	if (family == AF_NCA) {
		/*
		 * NL7C is not supported in the non-global zone,
		 * we enforce this restriction here.
		 */
		if (getzoneid() != GLOBAL_ZONEID) {
			*errorp = ENOTSUP;
			return (NULL);
		}

		/*
		 * An NCA socket is really an INET socket; remember the
		 * request so the sonode can be marked NL7C_AF_NCA below.
		 */
		nca_request = B_TRUE;
		family = AF_INET;
	}

	/*
	 * KM_SLEEP allocations block until memory is available and never
	 * return NULL, so there is no allocation failure path here.
	 */
	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
	so = kmem_cache_alloc(cp, KM_SLEEP);

	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
	sotpi_info_init(so);

	if (nca_request)
		SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;

	if (version == SOV_DEFAULT)
		version = so_default_version;
	so->so_version = (short)version;

	*errorp = 0;
	return (so);
}
/*
 * Release a TPI sonode.
 *
 * A sonode that never went through a fallback lives in one of the TPI
 * sonode caches and is returned there. After a completed fallback
 * (SS_FALLBACK_COMP) the sotpi_info_t was allocated separately, so it is
 * freed explicitly and the sonode itself is handed to the destroy
 * function of the socket module it originally came from.
 */
static void
sotpi_destroy(struct sonode *so)
{
	struct sockparams	*fallback_sp;
	kmem_cache_t		*cache;

	/*
	 * Should a new dealloc function (i.e. smod_destroy_func) be added,
	 * it is expected to verify the ops vector as well.
	 */
	ASSERT(so->so_ops == &sotpi_sonodeops);

	/* Pick up the original sockparams before finalizing the TPI info. */
	fallback_sp = SOTOTPI(so)->sti_orig_sp;
	sotpi_info_fini(so);

	if (!(so->so_state & SS_FALLBACK_COMP)) {
		/* Plain TPI socket: give the sonode back to its cache. */
		sonode_fini(so);
		if (so->so_family == AF_UNIX)
			cache = socktpi_unix_cache;
		else
			cache = socktpi_cache;
		kmem_cache_free(cache, so);
		return;
	}

	/*
	 * A fallback happened: the sotpi_info_t did not come from the TPI
	 * sonode cache, so free it here and let the original socket module
	 * dispose of the sonode.
	 */
	sotpi_info_destroy(so);
	ASSERT(fallback_sp != NULL);
	fallback_sp->sp_smod_info->smod_sock_destroy_func(so);
	SOCKPARAMS_DEC_REF(fallback_sp);
}
/*
 * Open the stream underneath a TPI socket and finish setting the socket up.
 *
 * so is the socket being initialized. tso, when non-NULL, is the listener
 * from which an accepted stream socket inherits its direct-call setting;
 * it is also passed on to so_strinit(). cr is the credential used for the
 * stream open and the follow-up requests. flags are the open flags:
 * FREAD|FWRITE are always added, and SO_SOCKSTR / SO_ACCEPTOR are adjusted
 * here depending on the transport.
 *
 * Returns 0 on success. Otherwise returns the error from stropen(),
 * strioctl() or so_strinit(), ENOTTY when the opened stream is a tty, or
 * EPROTONOSUPPORT when the SO_PROTOTYPE option cannot be set. Except for
 * a stropen() failure that left no stream head with STREOPENFAIL set, the
 * socket is closed with sotpi_close() before an error is returned. When
 * SO_FALLBACK is set the function returns right after the direct-call
 * probe; the remaining stream setup is done later.
 */
/* ARGSUSED1 */
int
sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
{
	sotpi_info_t	*tpi = SOTOTPI(so);
	struct vnode	*vp;
	struct stdata	*strhead;
	major_t		drv_major;
	dev_t		opendev;
	int		error = 0;
	int		inet_tcpudp;

	dprint(1, ("sotpi_init()\n"));

	/*
	 * The sleep flag passed in gets overwritten, which is fine since
	 * TPI sockets do not honor it.
	 */
	flags |= FREAD|FWRITE;

	/* Note in so_flag when the device is a clone. */
	if (getmajor(tpi->sti_dev) == clone_major)
		so->so_flag |= SOCLONE;

	inet_tcpudp =
	    (so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
	    (so->so_protocol == IPPROTO_TCP ||
	    so->so_protocol == IPPROTO_UDP ||
	    so->so_protocol == IPPROTO_IP);

	if (inet_tcpudp) {
		/* Let tcp/udp know that sockets are on top of it. */
		flags |= SO_SOCKSTR;

		/*
		 * Request direct calls between sockfs and the transport;
		 * whether they are really used is decided further down,
		 * once the stream is open.
		 */
		tpi->sti_direct = 1;

		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
		if (so->so_type == SOCK_STREAM && tso != NULL) {
			if (!SOTOTPI(tso)->sti_direct) {
				/*
				 * The listener has been converted from a
				 * socket to a stream (sti_direct is off);
				 * the acceptor must follow suit.
				 */
				tpi->sti_direct = 0;
				flags &= ~SO_SOCKSTR;
			} else {
				/*
				 * Listener uses direct calls: open this
				 * instance as an accept fast-path acceptor.
				 */
				flags |= SO_ACCEPTOR;
			}
		}
	}

	/* The local transport is told about sockets as well. */
	if (so->so_family == AF_UNIX)
		flags |= SO_SOCKSTR;

	vp = SOTOV(so);
	opendev = vp->v_rdev;
	drv_major = getmajor(opendev);
	ASSERT(STREAMSTAB(drv_major));

	error = stropen(vp, &opendev, flags, cr);
	strhead = vp->v_stream;

	if (error != 0) {
		/*
		 * A socket is never reopened (unlike specfs), but the
		 * stream head still sets STREOPENFAIL when an autopush
		 * fails, i.e. when the open failed part way through.
		 */
		if (strhead != NULL && (strhead->sd_flag & STREOPENFAIL)) {
			mutex_enter(&strhead->sd_lock);
			strhead->sd_flag &= ~STREOPENFAIL;
			mutex_exit(&strhead->sd_lock);
			(void) sotpi_close(so, flags, cr);
			return (error);
		}
		ASSERT(strhead == NULL);
		goto out;
	}

	/* A clone open must have produced a different device. */
	ASSERT(!(so->so_flag & SOCLONE) || opendev != vp->v_rdev);

	mutex_enter(&so->so_lock);
	tpi->sti_dev = opendev;
	vp->v_rdev = opendev;
	mutex_exit(&so->so_lock);

	if (strhead->sd_flag & STRISTTY) {
		/*
		 * Post SVR4 tty driver - a socket can not be a controlling
		 * terminal, so the open is failed.
		 */
		(void) sotpi_close(so, flags, cr);
		return (ENOTTY);	/* XXX */
	}

	ASSERT(strhead->sd_wrq != NULL);
	tpi->sti_provinfo = tpi_findprov(strhead->sd_wrq);

	/*
	 * When direct calls were requested, look at the module right below
	 * the stream head to see whether it qualifies. If it does not, the
	 * direct interface is switched off. For an acceptor only the
	 * sti_direct flag is cleared here; the fallback itself happens once
	 * the accept has completed, before the new socket is handed to the
	 * application.
	 */
	if (tpi->sti_direct) {
		queue_t	*below = strhead->sd_wrq->q_next;
		int	direct_ok;

		/*
		 * Direct calls are supported and tested for tcp/udp only,
		 * hence these assertions.
		 */
		ASSERT(so->so_family == AF_INET ||
		    so->so_family == AF_INET6);
		ASSERT(so->so_protocol == IPPROTO_UDP ||
		    so->so_protocol == IPPROTO_TCP ||
		    so->so_protocol == IPPROTO_IP);
		ASSERT(so->so_type == SOCK_DGRAM ||
		    so->so_type == SOCK_STREAM);

		/*
		 * Both queues of the module below the stream head have to
		 * carry _QDIRECT and the tunable has to be on. The flag can
		 * be missing for tcp/udp when another module was autopushed
		 * on top, or when the expected module is not purely D_MP.
		 */
		direct_ok = socktpi_direct &&
		    (below->q_flag & _QDIRECT) &&
		    (_OTHERQ(below)->q_flag & _QDIRECT);

		if (!direct_ok) {
			int rval;

			/* Carry on without direct calls. */
			tpi->sti_direct = 0;

			/*
			 * No ioctl for a fallback socket: its queue has no
			 * conn yet and the fallback downcall tells the
			 * protocol about the change. Acceptors do not send
			 * it here either.
			 */
			if ((flags & (SO_ACCEPTOR | SO_FALLBACK)) == 0) {
				error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0,
				    K_TO_K, cr, &rval);
				if (error != 0) {
					(void) sotpi_close(so, flags, cr);
					return (error);
				}
			}
		} else if ((_OTHERQ(below)->q_flag & _QSODIRECT) &&
		    uioasync.enabled) {
			/*
			 * Read side has _QSODIRECT and uioasync is enabled:
			 * turn sodirect on.
			 */
			so->so_state |= SS_SODIRECT;
		}
	}

	if (flags & SO_FALLBACK) {
		/*
		 * This stream has no conn yet; the stream set up is done
		 * after a conn has been assigned.
		 */
		return (error);
	}

	error = so_strinit(so, tso);
	if (error != 0) {
		(void) sotpi_close(so, flags, cr);
		return (error);
	}

	/* Wildcard: the requested protocol differs from the sockparams. */
	if (so->so_protocol != so->so_sockparams->sp_protocol) {
		int protocol = so->so_protocol;

		/* Issue SO_PROTOTYPE setsockopt. */
		error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
		    &protocol, (t_uscalar_t)sizeof (protocol), cr);
		if (error != 0) {
			(void) sotpi_close(so, flags, cr);
			/*
			 * The setsockopt tends to fail with ENOPROTOOPT,
			 * whereas socket() is expected to fail with
			 * EPROTONOSUPPORT/EPROTOTYPE.
			 */
			return (EPROTONOSUPPORT);
		}
	}

out:
	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
	    "sockfs open:maj %d vp %p so %p error %d",
	    drv_major, vp, so, error);
	return (error);
}
/*
* Bind the socket to an unspecified address in sockfs only.
* Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
* required in all cases.
*/
static void
so_automatic_bind(struct sonode *so)
{
sotpi_info_t *sti = SOTOTPI(so);
ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(!(so->so_state & SS_ISBOUND));
ASSERT(sti->sti_unbind_mp);
ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
sti->sti_laddr_sa->sa_family = so->so_family;
so->so_state |= SS_ISBOUND;
}
/*
* bind the socket.
*
* If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
* are passed in we allow rebinding. Note that for backwards compatibility
* even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
* Thus the rebinding code is currently not executed.
*
* The constraints for rebinding are:
* - it is a SOCK_DGRAM, or
* - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
* and no listen() has been done.
* This rebinding code was added based on some language in the XNET book
* about not returning EINVAL if the protocol allows rebinding. However,
* this language is not present in the Posix socket draft. Thus maybe the
* rebinding logic should be deleted from the source.
*
* A null "name" can be used to unbind the socket if:
* - it is a SOCK_DGRAM, or
* - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
* and no listen() has been done.
*/
/* ARGSUSED */
static int
sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
socklen_t namelen, int backlog, int flags, struct cred *cr)
{
struct T_bind_req bind_req;
struct T_bind_ack *bind_ack;
int error = 0;
mblk_t *mp;
void *addr;
t_uscalar_t addrlen;
int unbind_on_err = 1;
boolean_t clear_acceptconn_on_err = B_FALSE;
boolean_t restore_backlog_on_err = B_FALSE;
int save_so_backlog;
t_scalar_t PRIM_type = O_T_BIND_REQ;
boolean_t tcp_udp_xport;
void *nl7c = NULL;
sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
(void *)so, (void *)name, namelen, backlog, flags,
pr_state(so->so_state, so->so_mode)));
tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
if (!(flags & _SOBIND_LOCK_HELD)) {
mutex_enter(&so->so_lock);
so_lock_single(so); /* Set SOLOCKED */
} else {
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(so->so_flag & SOLOCKED);
}
/*
* Make sure that there is a preallocated unbind_req message
* before binding. This message is allocated when the socket is
* created but it might have been consumed.
*/
if (sti->sti_unbind_mp == NULL) {
dprintso(so, 1, ("sobind: allocating unbind_req\n"));
/* NOTE: holding so_lock while sleeping */
sti->sti_unbind_mp =
soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
}
if (flags & _SOBIND_REBIND) {
/*
* Called from solisten after doing an sotpi_unbind() or
* potentially without the unbind (latter for AF_INET{,6}).
*/
ASSERT(name == NULL && namelen == 0);
if (so->so_family == AF_UNIX) {
ASSERT(sti->sti_ux_bound_vp);
addr = &sti->sti_ux_laddr;
addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
"addr 0x%p, vp %p\n",
addrlen,
(void *)((struct so_ux_addr *)addr)->soua_vp,
(void *)sti->sti_ux_bound_vp));
} else {
addr = sti->sti_laddr_sa;
addrlen = (t_uscalar_t)sti->sti_laddr_len;
}
} else if (flags & _SOBIND_UNSPEC) {
ASSERT(name == NULL && namelen == 0);
/*
* The caller checked SS_ISBOUND but not necessarily
* under so_lock
*/
if (so->so_state & SS_ISBOUND) {
/* No error */
goto done;
}
/* Set an initial local address */
switch (so->so_family) {
case AF_UNIX:
/*
* Use an address with same size as struct sockaddr
* just like BSD.
*/
sti->sti_laddr_len =
(socklen_t)sizeof (struct sockaddr);
ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
sti->sti_laddr_sa->sa_family = so->so_family;
/*
* Pass down an address with the implicit bind
* magic number and the rest all zeros.
* The transport will return a unique address.
*/
sti->sti_ux_laddr.soua_vp = NULL;
sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
addr = &sti->sti_ux_laddr;
addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
break;
case AF_INET:
case AF_INET6:
/*
* An unspecified bind in TPI has a NULL address.
* Set the address in sockfs to have the sa_family.
*/
sti->sti_laddr_len = (so->so_family == AF_INET) ?
(socklen_t)sizeof (sin_t) :
(socklen_t)sizeof (sin6_t);
ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
sti->sti_laddr_sa->sa_family = so->so_family;
addr = NULL;
addrlen = 0;
break;
default:
/*
* An unspecified bind in TPI has a NULL address.
* Set the address in sockfs to be zero length.
*
* Can not assume there is a sa_family for all
* protocol families. For example, AF_X25 does not
* have a family field.
*/
bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
sti->sti_laddr_len = 0; /* XXX correct? */
addr = NULL;
addrlen = 0;
break;
}
} else {
if (so->so_state & SS_ISBOUND) {
/*
* If it is ok to rebind the socket, first unbind
* with the transport. A rebind to the NULL address
* is interpreted as an unbind.
* Note that a bind to NULL in BSD does unbind the
* socket but it fails with EINVAL.
* Note that regular sockets set SOV_SOCKBSD i.e.
* _SOBIND_SOCKBSD gets set here hence no type of
* socket does currently allow rebinding.
*
* If the name is NULL just do an unbind.
*/
if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
name != NULL) {
error = EINVAL;
unbind_on_err = 0;
eprintsoline(so, error);
goto done;
}
if ((so->so_mode & SM_CONNREQUIRED) &&
(so->so_state & SS_CANTREBIND)) {
error = EINVAL;
unbind_on_err = 0;
eprintsoline(so, error);
goto done;
}
error = sotpi_unbind(so, 0);
if (error) {
eprintsoline(so, error);
goto done;
}
ASSERT(!(so->so_state & SS_ISBOUND));
if (name == NULL) {
so->so_state &=
~(SS_ISCONNECTED|SS_ISCONNECTING);
goto done;
}
}
/* X/Open requires this check */
if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
if (xnet_check_print) {
printf("sockfs: X/Open bind state check "
"caused EINVAL\n");
}
error = EINVAL;
goto done;
}
switch (so->so_family) {
case AF_UNIX:
/*
* All AF_UNIX addresses are nul terminated
* when copied (copyin_name) in so the minimum
* length is 3 bytes.
*/
if (name == NULL ||
(ssize_t)namelen <= sizeof (short) + 1) {
error = EISDIR;
eprintsoline(so, error);
goto done;
}
/*
* Verify so_family matches the bound family.
* BSD does not check this for AF_UNIX resulting
* in funny mknods.
*/
if (name->sa_family != so->so_family) {
error = EAFNOSUPPORT;
goto done;
}
break;
case AF_INET:
if (name == NULL) {
error = EINVAL;
eprintsoline(so, error);
goto done;
}
if ((size_t)namelen != sizeof (sin_t)) {
error = name->sa_family != so->so_family ?
EAFNOSUPPORT : EINVAL;
eprintsoline(so, error);
goto done;
}
if ((flags & _SOBIND_XPG4_2) &&
(name->sa_family != so->so_family)) {
/*
* This check has to be made for X/Open
* sockets however application failures have
* been observed when it is applied to
* all sockets.
*/
error = EAFNOSUPPORT;
eprintsoline(so, error);
goto done;
}
/*
* Force a zero sa_family to match so_family.
*
* Some programs like inetd(1M) don't set the
* family field. Other programs leave
* sin_family set to garbage - SunOS 4.X does
* not check the family field on a bind.
* We use the family field that
* was passed in to the socket() call.
*/
name->sa_family = so->so_family;
break;
case AF_INET6: {
#ifdef DEBUG
sin6_t *sin6 = (sin6_t *)name;
#endif /* DEBUG */
if (name == NULL) {
error = EINVAL;
eprintsoline(so, error);
goto done;
}
if ((size_t)namelen != sizeof (sin6_t)) {
error = name->sa_family != so->so_family ?
EAFNOSUPPORT : EINVAL;
eprintsoline(so, error);
goto done;
}
if (name->sa_family != so->so_family) {
/*
* With IPv6 we require the family to match
* unlike in IPv4.
*/
error = EAFNOSUPPORT;
eprintsoline(so, error);
goto done;
}
#ifdef DEBUG
/*
* Verify that apps don't forget to clear
* sin6_scope_id etc
*/
if (sin6->sin6_scope_id != 0 &&
!IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
zcmn_err(getzoneid(), CE_WARN,
"bind with uninitialized sin6_scope_id "
"(%d) on socket. Pid = %d\n",
(int)sin6->sin6_scope_id,
(int)curproc->p_pid);
}
if (sin6->__sin6_src_id != 0) {
zcmn_err(getzoneid(), CE_WARN,
"bind with uninitialized __sin6_src_id "
"(%d) on socket. Pid = %d\n",
(int)sin6->__sin6_src_id,
(int)curproc->p_pid);
}
#endif /* DEBUG */
break;
}
default:
/*
* Don't do any length or sa_family check to allow
* non-sockaddr style addresses.
*/
if (name == NULL) {
error = EINVAL;
eprintsoline(so, error);
goto done;
}
break;
}
if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
error = ENAMETOOLONG;
eprintsoline(so, error);
goto done;
}
/*
* Save local address.
*/
sti->sti_laddr_len = (socklen_t)namelen;
ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
bcopy(name, sti->sti_laddr_sa, namelen);
addr = sti->sti_laddr_sa;
addrlen = (t_uscalar_t)sti->sti_laddr_len;
switch (so->so_family) {
case AF_INET6:
case AF_INET:
break;
case AF_UNIX: {
struct sockaddr_un *soun =
(struct sockaddr_un *)sti->sti_laddr_sa;
struct vnode *vp, *rvp;
struct vattr vattr;
ASSERT(sti->sti_ux_bound_vp == NULL);
/*
* Create vnode for the specified path name.
* Keep vnode held with a reference in sti_ux_bound_vp.
* Use the vnode pointer as the address used in the
* bind with the transport.
*
* Request mode 0777 with the bits set in the calling
* process's umask (u_cmask) cleared.
*/
/* MAXPATHLEN + soun_family + nul termination */
if (sti->sti_laddr_len >
(socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
error = ENAMETOOLONG;
eprintsoline(so, error);
goto done;
}
vattr.va_type = VSOCK;
vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
vattr.va_mask = AT_TYPE|AT_MODE;
/* NOTE: holding so_lock */
error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
EXCL, 0, &vp, CRMKNOD, 0, 0);
if (error) {
if (error == EEXIST)
error = EADDRINUSE;
eprintsoline(so, error);
goto done;
}
/*
* Establish pointer from the underlying filesystem
* vnode to the socket node.
* sti_ux_bound_vp and v_stream->sd_vnode form the
* cross-linkage between the underlying filesystem
* node and the socket node.
*/
if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
VN_HOLD(rvp);
VN_RELE(vp);
vp = rvp;
}
ASSERT(SOTOV(so)->v_stream);
mutex_enter(&vp->v_lock);
vp->v_stream = SOTOV(so)->v_stream;
sti->sti_ux_bound_vp = vp;
mutex_exit(&vp->v_lock);
/*
* Use the vnode pointer value as a unique address
* (together with the magic number to avoid conflicts
* with implicit binds) in the transport provider.
*/
sti->sti_ux_laddr.soua_vp =
(void *)sti->sti_ux_bound_vp;
sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
addr = &sti->sti_ux_laddr;
addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
addrlen,
(void *)((struct so_ux_addr *)addr)->soua_vp));
break;
}
} /* end switch (so->so_family) */
}
/*
* set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
* the transport can start passing up T_CONN_IND messages
* as soon as it receives the bind req and strsock_proto()
* insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
*/
if (flags & _SOBIND_LISTEN) {
if ((so->so_state & SS_ACCEPTCONN) == 0)
clear_acceptconn_on_err = B_TRUE;
save_so_backlog = so->so_backlog;
restore_backlog_on_err = B_TRUE;
so->so_state |= SS_ACCEPTCONN;
so->so_backlog = backlog;
}
/*
* If NL7C addr(s) have been configured check for addr/port match,
* or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
*
* NL7C supports the TCP transport only so check AF_INET and AF_INET6
* family sockets only. If match mark as such.
*/
if (nl7c_enabled && ((addr != NULL &&
(so->so_family == AF_INET || so->so_family == AF_INET6) &&
(nl7c = nl7c_lookup_addr(addr, addrlen))) ||
sti->sti_nl7c_flags == NL7C_AF_NCA)) {
/*
* NL7C is not supported in non-global zones,
* we enforce this restriction here.
*/
if (so->so_zoneid == GLOBAL_ZONEID) {
/* An NL7C socket, mark it */
sti->sti_nl7c_flags |= NL7C_ENABLED;
if (nl7c == NULL) {
/*
* Was an AF_NCA bind() so add it to the
* addr list for reporting purposes.
*/
nl7c = nl7c_add_addr(addr, addrlen);
}
} else
nl7c = NULL;
}
/*
* We send a T_BIND_REQ for TCP/UDP since we know it supports it,
* for other transports we will send in a O_T_BIND_REQ.
*/
if (tcp_udp_xport &&
(so->so_family == AF_INET || so->so_family == AF_INET6))
PRIM_type = T_BIND_REQ;
bind_req.PRIM_type = PRIM_type;
bind_req.ADDR_length = addrlen;
bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
bind_req.CONIND_number = backlog;
/* NOTE: holding so_lock while sleeping */
mp = soallocproto2(&bind_req, sizeof (bind_req),
addr, addrlen, 0, _ALLOC_SLEEP);
sti->sti_laddr_valid = 0;
/* Done using sti_laddr_sa - can drop the lock */
mutex_exit(&so->so_lock);
/*
* Intercept the bind_req message here to check if this <address/port>
* was configured as an SSL proxy server, or if another endpoint was
* already configured to act as a proxy for us.
*
* Note, only if NL7C not enabled for this socket.
*/
if (nl7c == NULL &&
(so->so_family == AF_INET || so->so_family == AF_INET6) &&
so->so_type == SOCK_STREAM) {
if (sti->sti_kssl_ent != NULL) {
kssl_release_ent(sti->sti_kssl_ent, so,
sti->sti_kssl_type);
sti->sti_kssl_ent = NULL;
}
sti->sti_kssl_type = kssl_check_proxy(mp, so,
&sti->sti_kssl_ent);
switch (sti->sti_kssl_type) {
case KSSL_NO_PROXY:
break;
case KSSL_HAS_PROXY:
mutex_enter(&so->so_lock);
goto skip_transport;
case KSSL_IS_PROXY:
break;
}
}
error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
if (error) {
eprintsoline(so, error);
mutex_enter(&so->so_lock);
goto done;
}
mutex_enter(&so->so_lock);
error = sowaitprim(so, PRIM_type, T_BIND_ACK,
(t_uscalar_t)sizeof (*bind_ack), &mp, 0);
if (error) {
eprintsoline(so, error);
goto done;
}
skip_transport:
ASSERT(mp);
/*
* Even if some TPI message (e.g. T_DISCON_IND) was received in
* strsock_proto while the lock was dropped above, the bind
* is allowed to complete.
*/
/* Mark as bound. This will be undone if we detect errors below. */
if (flags & _SOBIND_NOXLATE) {
ASSERT(so->so_family == AF_UNIX);
sti->sti_faddr_noxlate = 1;
}
ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
so->so_state |= SS_ISBOUND;
ASSERT(sti->sti_unbind_mp);
/* note that we've already set SS_ACCEPTCONN above */
/*
* Recompute addrlen - an unspecified bind sent down an
* address of length zero but we expect the appropriate length
* in return.
*/
addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
bind_ack = (struct T_bind_ack *)mp->b_rptr;
/*
* The alignment restriction is really too strict but
* we want enough alignment to inspect the fields of
* a sockaddr_in.
*/
addr = sogetoff(mp, bind_ack->ADDR_offset,
bind_ack->ADDR_length,
__TPI_ALIGN_SIZE);
if (addr == NULL) {
freemsg(mp);
error = EPROTO;
eprintsoline(so, error);
goto done;
}
if (!(flags & _SOBIND_UNSPEC)) {
/*
* Verify that the transport didn't return something we
* did not want e.g. an address other than what we asked for.
*
* NOTE: These checks would go away if/when we switch to
* using the new TPI (in which the transport would fail
* the request instead of assigning a different address).
*
* NOTE2: For protocols that we don't know (i.e. any
* other than AF_INET6, AF_INET and AF_UNIX), we
* cannot know if the transport should be expected to
* return the same address as that requested.
*
* NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
* down a T_BIND_REQ. We use O_T_BIND_REQ for others.
*
* For example, in the case of netatalk it may be
* inappropriate for the transport to return the
* requested address (as it may have allocated a local
* port number in behaviour similar to that of an
* AF_INET bind request with a port number of zero).
*
* Given the definition of O_T_BIND_REQ, where the
* transport may bind to an address other than the
* requested address, it's not possible to determine
* whether a returned address that differs from the
* requested address is a reason to fail (because the
* requested address was not available) or succeed
* (because the transport allocated an appropriate
* address and/or port).
*
* sockfs currently requires that the transport return
* the requested address in the T_BIND_ACK, unless
* there is code here to allow for any discrepancy.
* Such code exists for AF_INET and AF_INET6.
*
* Netatalk chooses to return the requested address
* rather than the (correct) allocated address. This
* means that netatalk violates the TPI specification
* (and would not function correctly if used from a
* TLI application), but it does mean that it works
* with sockfs.
*
* As noted above, using the newer XTI bind primitive
* (T_BIND_REQ) in preference to O_T_BIND_REQ would
* allow sockfs to be more sure about whether or not
* the bind request had succeeded (as transports are
* not permitted to bind to a different address than
* that requested - they must return failure).
* Unfortunately, support for T_BIND_REQ may not be
* present in all transport implementations (netatalk,
* for example, doesn't have it), making the
* transition difficult.
*/
if (bind_ack->ADDR_length != addrlen) {
/* Assumes that the requested address was in use */
freemsg(mp);
error = EADDRINUSE;
eprintsoline(so, error);
goto done;
}
switch (so->so_family) {
case AF_INET6:
case AF_INET: {
sin_t *rname, *aname;
rname = (sin_t *)addr;
aname = (sin_t *)sti->sti_laddr_sa;
/*
* Take advantage of the alignment
* of sin_port and sin6_port which fall
* in the same place in their data structures.
* Just use sin_port for either address family.
*
* This may become a problem if (heaven forbid)
* there's a separate ipv6port_reserved... :-P
*
* Binding to port 0 has the semantics of letting
* the transport bind to any port.
*
* If the transport is TCP or UDP since we had sent
* a T_BIND_REQ we would not get a port other than
* what we asked for.
*/
if (tcp_udp_xport) {
/*
* Pick up the new port number if we bound to
* port 0.
*/
if (aname->sin_port == 0)
aname->sin_port = rname->sin_port;
sti->sti_laddr_valid = 1;
break;
}
if (aname->sin_port != 0 &&
aname->sin_port != rname->sin_port) {
freemsg(mp);
error = EADDRINUSE;
eprintsoline(so, error);
goto done;
}
/*
* Pick up the new port number if we bound to port 0.
*/
aname->sin_port = rname->sin_port;
/*
* Unfortunately, addresses aren't _quite_ the same.
*/
if (so->so_family == AF_INET) {
if (aname->sin_addr.s_addr !=
rname->sin_addr.s_addr) {
freemsg(mp);
error = EADDRNOTAVAIL;
eprintsoline(so, error);
goto done;
}
} else {
sin6_t *rname6 = (sin6_t *)rname;
sin6_t *aname6 = (sin6_t *)aname;
if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
&rname6->sin6_addr)) {
freemsg(mp);
error = EADDRNOTAVAIL;
eprintsoline(so, error);
goto done;
}
}
break;
}
case AF_UNIX:
if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
freemsg(mp);
error = EADDRINUSE;
eprintsoline(so, error);
eprintso(so,
("addrlen %d, addr 0x%x, vp %p\n",
addrlen, *((int *)addr),
(void *)sti->sti_ux_bound_vp));
goto done;
}
sti->sti_laddr_valid = 1;
break;
default:
/*
* NOTE: This assumes that addresses can be
* byte-compared for equivalence.
*/
if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
freemsg(mp);
error = EADDRINUSE;
eprintsoline(so, error);
goto done;
}
/*
* Don't mark sti_laddr_valid, as we cannot be
* sure that the returned address is the real
* bound address when talking to an unknown
* transport.
*/
break;
}
} else {
/*
* Save the returned address for getsockname.
* Needed for unspecific bind unless transport supports
* the TI_GETMYNAME ioctl.
* Do this for AF_INET{,6} even though they do, as
* caching info here is much better performance than
* a TPI/STREAMS trip to the transport for getsockname.
* Any which can't for some reason _must_ _not_ set
* sti_laddr_valid here for the caching version of
* getsockname to not break;
*/
switch (so->so_family) {
case AF_UNIX:
/*
* Record the address bound with the transport
* for use by socketpair.
*/
bcopy(addr, &sti->sti_ux_laddr, addrlen);
sti->sti_laddr_valid = 1;
break;
case AF_INET:
case AF_INET6:
ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
sti->sti_laddr_valid = 1;
break;
default:
/*
* Don't mark sti_laddr_valid, as we cannot be
* sure that the returned address is the real
* bound address when talking to an unknown
* transport.
*/
break;
}
}
if (nl7c != NULL) {
/* Register listen()er sonode pointer with NL7C */
nl7c_listener_addr(nl7c, so);
}
freemsg(mp);
done:
if (error) {
/* reset state & backlog to values held on entry */
if (clear_acceptconn_on_err == B_TRUE)
so->so_state &= ~SS_ACCEPTCONN;
if (restore_backlog_on_err == B_TRUE)
so->so_backlog = save_so_backlog;
if (unbind_on_err && so->so_state & SS_ISBOUND) {
int err;
err = sotpi_unbind(so, 0);
/* LINTED - statement has no consequent: if */
if (err) {
eprintsoline(so, error);
} else {
ASSERT(!(so->so_state & SS_ISBOUND));
}
}
}
if (!(flags & _SOBIND_LOCK_HELD)) {
so_unlock_single(so, SOLOCKED);
mutex_exit(&so->so_lock);
} else {
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(so->so_flag & SOLOCKED);
}
return (error);
}
/*
 * Bind the socket.
 *
 * Thin wrapper around sotpi_bindlisten().  The _SOBIND_SOCKETPAIR flag is
 * consumed here: when it is set it is stripped from the flags and a backlog
 * of 1 is passed down, otherwise the flags go through untouched with a
 * backlog of 0.  The return value is whatever sotpi_bindlisten() returns.
 */
static int
sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int flags, struct cred *cr)
{
	int backlog = 0;

	if (flags & _SOBIND_SOCKETPAIR) {
		backlog = 1;
		flags &= ~_SOBIND_SOCKETPAIR;
	}

	return (sotpi_bindlisten(so, name, namelen, backlog, flags, cr));
}
/*
 * Unbind a socket - used when bind() fails, when bind() specifies a NULL
 * address, or when listen needs to unbind and bind.
 * If the _SOUNBIND_REBIND flag is specified the addresses are retained
 * so that a sobind can pick them up.
 *
 * The caller must hold so_lock and must have set SOLOCKED.  so_lock is
 * dropped while the M_FLUSH and T_UNBIND_REQ are sent downstream and is
 * re-acquired before waiting for the acknowledgement; on return (success
 * or failure) so_lock is held and SOLOCKED is still set.
 *
 * Returns 0 on success, EINVAL if the socket is not bound, or the error
 * reported by kstrputmsg() / sowaitokack().
 */
static int
sotpi_unbind(struct sonode *so, int flags)
{
	struct T_unbind_req	unbind_req;
	int			error = 0;
	mblk_t			*mp;
	sotpi_info_t		*sti = SOTOTPI(so);

	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));

	ASSERT(MUTEX_HELD(&so->so_lock));
	ASSERT(so->so_flag & SOLOCKED);

	if (!(so->so_state & SS_ISBOUND)) {
		error = EINVAL;
		eprintsoline(so, error);
		goto done;
	}

	mutex_exit(&so->so_lock);

	/*
	 * Flush the read and write side (except stream head read queue)
	 * and send down T_UNBIND_REQ.
	 */
	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);

	unbind_req.PRIM_type = T_UNBIND_REQ;
	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
	    0, _ALLOC_SLEEP);
	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
	mutex_enter(&so->so_lock);
	if (error) {
		eprintsoline(so, error);
		goto done;
	}

	error = sowaitokack(so, T_UNBIND_REQ);
	if (error) {
		eprintsoline(so, error);
		goto done;
	}

	/*
	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
	 * strsock_proto while the lock was dropped above, the unbind
	 * is allowed to complete.
	 */
	if (!(flags & _SOUNBIND_REBIND)) {
		/*
		 * Clear out bound address.
		 */
		vnode_t *vp;

		/*
		 * Undo any SSL proxy setup.
		 *
		 * This must not be nested inside the sti_ux_bound_vp check
		 * below: the proxy entry only applies to AF_INET{,6}
		 * SOCK_STREAM sockets, so tying it to the presence of a
		 * bound vnode would keep the entry from ever being released
		 * here.
		 * NOTE(review): assumes sti_ux_bound_vp is only ever set for
		 * AF_UNIX sockets - confirm against the bind path.
		 */
		if ((so->so_family == AF_INET ||
		    so->so_family == AF_INET6) &&
		    (so->so_type == SOCK_STREAM) &&
		    (sti->sti_kssl_ent != NULL)) {
			kssl_release_ent(sti->sti_kssl_ent, so,
			    sti->sti_kssl_type);
			sti->sti_kssl_ent = NULL;
			sti->sti_kssl_type = KSSL_NO_PROXY;
		}

		/* Drop the hold on the bound vnode, if there is one. */
		if ((vp = sti->sti_ux_bound_vp) != NULL) {
			sti->sti_ux_bound_vp = NULL;
			vn_rele_stream(vp);
		}

		/* Clear out address */
		sti->sti_laddr_len = 0;
	}

	/* The unbind also takes the socket out of the listening state. */
	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
	sti->sti_laddr_valid = 0;

done:
	/* If the caller held the lock don't release it here */
	ASSERT(MUTEX_HELD(&so->so_lock));
	ASSERT(so->so_flag & SOLOCKED);

	return (error);
}
/*
 * Listen on the socket.
 *
 * For TPI conforming transports this has to first unbind with the transport
 * and then bind again using the new backlog.
 *
 * Returns 0 on success, EOPNOTSUPP when the transport's service type is
 * T_CLTS, EINVAL for a connected socket or an unbound AF_UNIX socket,
 * or the error from sotpi_unbind() / sotpi_bindlisten().
 */
/* ARGSUSED */
int
sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
{
	sotpi_info_t	*sti = SOTOTPI(so);
	int		error = 0;

	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));

	if (sti->sti_serv_type == T_CLTS)
		return (EOPNOTSUPP);

	/*
	 * A socket that already accepts connections is left alone, which
	 * avoids the case where a second listen() fails while a connection
	 * is pending and leaves the socket unbound.  The exception is the
	 * AF_INET{,6} case with solisten_tpi_tcp clear: no unbind with the
	 * transport is needed there, so the backlog can safely be raised.
	 */
	if ((so->so_state & SS_ACCEPTCONN) != 0 &&
	    ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
	    /*CONSTCOND*/
	    solisten_tpi_tcp))
		return (0);

	if (so->so_state & SS_ISCONNECTED)
		return (EINVAL);

	mutex_enter(&so->so_lock);
	so_lock_single(so);	/* Set SOLOCKED */

	/*
	 * Nothing to do when the backlog is unchanged on a listening
	 * socket; this avoids an EPROTO error from the transport.
	 */
	if ((so->so_state & SS_ACCEPTCONN) && so->so_backlog == backlog)
		goto done;

	if (!(so->so_state & SS_ISBOUND)) {
		if (so->so_family == AF_UNIX) {
			/* UNIX domain sockets must be bound explicitly. */
			error = EINVAL;
		} else {
			error = sotpi_bindlisten(so, NULL, 0, backlog,
			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN,
			    cr);
		}
	} else if (backlog <= 0) {
		/* Already bound and no backlog requested: just record it. */
		so->so_state |= SS_ACCEPTCONN;
		so->so_backlog = backlog;
	} else {
		/*
		 * AF_INET{,6} hack to avoid losing the port.
		 * Assumes that all AF_INET{,6} transports can handle a
		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
		 * has already bound, thus the unbind can be skipped for them.
		 */
		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
		    /*CONSTCOND*/
		    solisten_tpi_tcp)
			error = sotpi_unbind(so, _SOUNBIND_REBIND);

		if (error == 0) {
			error = sotpi_bindlisten(so, NULL, 0, backlog,
			    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN,
			    cr);
		}
	}

	/* Every successful path above leaves the socket listening. */
	ASSERT(error != 0 || (so->so_state & SS_ACCEPTCONN));

done:
	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);

	return (error);
}
/*
 * Disconnect either a specified seqno or all (-1).
 * The former is used on listening sockets only.
 *
 * When seqno == -1 sodisconnect could call sotpi_unbind. However,
 * the current use of sodisconnect(seqno == -1) is only for shutdown
 * so there is no point (and potentially incorrect) to unbind.
 *
 * With _SODISCONNECT_LOCK_HELD in flags the caller already holds so_lock
 * with SOLOCKED set and still does on return; otherwise both are taken
 * and released here.  In either case so_lock is dropped while the
 * T_DISCON_REQ is sent downstream.
 *
 * Returns 0 on success, EINVAL if the socket is neither connected,
 * connecting nor listening, or the error from kstrputmsg()/sowaitokack().
 */
static int
sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
{
	const int		caller_locked =
	    (flags & _SODISCONNECT_LOCK_HELD) != 0;
	struct T_discon_req	discon_req;
	sotpi_info_t		*sti;
	mblk_t			*mp;
	int			error = 0;

	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));

	if (caller_locked) {
		ASSERT(MUTEX_HELD(&so->so_lock));
		ASSERT(so->so_flag & SOLOCKED);
	} else {
		mutex_enter(&so->so_lock);
		so_lock_single(so);	/* Set SOLOCKED */
	}

	if ((so->so_state &
	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN)) == 0) {
		error = EINVAL;
		eprintsoline(so, error);
		goto done;
	}

	mutex_exit(&so->so_lock);

	/*
	 * Flush the write side (unless this is a listener)
	 * and then send down a T_DISCON_REQ.
	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
	 * and other messages.)
	 */
	if ((so->so_state & SS_ACCEPTCONN) == 0)
		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);

	discon_req.PRIM_type = T_DISCON_REQ;
	discon_req.SEQ_number = seqno;
	mp = soallocproto1(&discon_req, sizeof (discon_req),
	    0, _ALLOC_SLEEP);
	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);

	mutex_enter(&so->so_lock);
	if (error != 0) {
		eprintsoline(so, error);
		goto done;
	}

	error = sowaitokack(so, T_DISCON_REQ);
	if (error != 0) {
		eprintsoline(so, error);
		goto done;
	}

	/*
	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
	 * strsock_proto while the lock was dropped above, the disconnect
	 * is allowed to complete. However, it is not possible to
	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
	 */
	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);

	/* The cached local and peer addresses are no longer valid. */
	sti = SOTOTPI(so);
	sti->sti_laddr_valid = 0;
	sti->sti_faddr_valid = 0;

done:
	if (caller_locked) {
		/* If the caller held the lock don't release it here */
		ASSERT(MUTEX_HELD(&so->so_lock));
		ASSERT(so->so_flag & SOLOCKED);
	} else {
		so_unlock_single(so, SOLOCKED);
		mutex_exit(&so->so_lock);
	}

	return (error);
}
/*
 * Accept a connection on a listening socket.
 *
 * Waits for a T_CONN_IND with sowaitconnind(), creates the new socket with
 * socket_newconn(), records the peer address and credentials on it, binds
 * it, and then answers the connection indication:
 *  - when sti_direct is set on the listener, a T_CONN_RES carrying the
 *    option value from the T_CONN_IND is sent down the acceptor (new)
 *    stream and the local address is taken from the T_OK_ACK;
 *  - otherwise an {O_}T_CONN_RES is sent down the listener stream with
 *    SOLOCKED held on the listener.
 *
 * so	- the listening socket; must have SS_ACCEPTCONN set.
 * fflag - passed through to sowaitconnind().
 * cr	- credentials used for the NULL bind of the new socket.
 * nsop	- if non-NULL, receives the new sonode on success.
 *
 * Returns 0 on success.  If the socket is not listening the result is
 * EOPNOTSUPP for SOCK_DGRAM/SOCK_RAW and EINVAL otherwise.  Any failure
 * after the T_CONN_IND has been received disconnects that indication
 * (sodisconnect() with the saved SEQ_number) and, if the new socket was
 * already created, closes and releases it.
 */
/* ARGSUSED */
int
sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
    struct sonode **nsop)
{
	struct T_conn_ind	*conn_ind;
	struct T_conn_res	*conn_res;
	int			error = 0;
	mblk_t			*mp, *ctxmp, *ack_mp;
	struct sonode		*nso;
	vnode_t			*nvp;
	void			*src;
	t_uscalar_t		srclen;
	void			*opt;
	t_uscalar_t		optlen;
	t_scalar_t		PRIM_type;
	t_scalar_t		SEQ_number;
	size_t			sinlen;
	sotpi_info_t		*sti = SOTOTPI(so);
	sotpi_info_t		*nsti;

	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
	    (void *)so, fflag, (void *)nsop,
	    pr_state(so->so_state, so->so_mode)));

	/*
	 * Defer single-threading the accepting socket until
	 * the T_CONN_IND has been received and parsed and the
	 * new sonode has been opened.
	 */

	/* The socket must be listening. */
	if ((so->so_state & SS_ACCEPTCONN) == 0)
		goto conn_bad;
again:
	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
		goto e_bad;

	ASSERT(mp != NULL);
	conn_ind = (struct T_conn_ind *)mp->b_rptr;
	ctxmp = mp->b_cont;

	/*
	 * Save SEQ_number for error paths.
	 */
	SEQ_number = conn_ind->SEQ_number;

	srclen = conn_ind->SRC_length;
	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
	if (src == NULL) {
		error = EPROTO;
		freemsg(mp);
		eprintsoline(so, error);
		goto disconnect_unlocked;
	}
	optlen = conn_ind->OPT_length;
	switch (so->so_family) {
	case AF_INET:
	case AF_INET6:
		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
			/* The option is a pointer-sized value; copy it out. */
			bcopy(mp->b_rptr + conn_ind->OPT_offset,
			    &opt, conn_ind->OPT_length);
		} else {
			/*
			 * The transport (in this case TCP) hasn't sent up
			 * a pointer to an instance for the accept fast-path.
			 * Disable fast-path completely because the call to
			 * sotpi_create() below would otherwise create an
			 * incomplete TCP instance, which would lead to
			 * problems when sockfs sends a normal T_CONN_RES
			 * message down the new stream.
			 */
			if (sti->sti_direct) {
				int rval;
				/*
				 * For consistency we inform tcp to disable
				 * direct interface on the listener, though
				 * we can certainly live without doing this
				 * because no data will ever travel upstream
				 * on the listening socket.
				 */
				sti->sti_direct = 0;
				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
				    0, 0, K_TO_K, CRED(), &rval);
			}
			opt = NULL;
			optlen = 0;
		}
		break;
	case AF_UNIX:
	default:
		if (optlen != 0) {
			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
			    __TPI_ALIGN_SIZE);
			if (opt == NULL) {
				error = EPROTO;
				freemsg(mp);
				eprintsoline(so, error);
				goto disconnect_unlocked;
			}
		}
		if (so->so_family == AF_UNIX) {
			if (!sti->sti_faddr_noxlate) {
				src = NULL;
				srclen = 0;
			}
			/* Extract src address from options */
			if (optlen != 0)
				so_getopt_srcaddr(opt, optlen, &src, &srclen);
		}
		break;
	}

	/*
	 * Create the new socket.
	 */
	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
	if (nso == NULL) {
		ASSERT(error != 0);
		/*
		 * Accept can not fail with ENOBUFS. sotpi_create
		 * sleeps waiting for memory until a signal is caught
		 * so return EINTR.
		 */
		freemsg(mp);
		if (error == ENOBUFS)
			error = EINTR;
		goto e_disc_unl;
	}
	nvp = SOTOV(nso);
	nsti = SOTOTPI(nso);

	/*
	 * If the transport sent up an SSL connection context, then attach
	 * it to the new socket, and set the (sd_wputdatafunc)() and
	 * (sd_rputdatafunc)() stream head hooks to intercept and process
	 * SSL records.
	 */
	if (ctxmp != NULL) {
		/*
		 * This kssl_ctx_t is already held for us by the transport.
		 * So, we don't need to do a kssl_hold_ctx() here.
		 */
		nsti->sti_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
		freemsg(ctxmp);
		mp->b_cont = NULL;
		strsetrwputdatahooks(nvp, strsock_kssl_input,
		    strsock_kssl_output);

		/* Disable sodirect if any */
		if (nso->so_direct != NULL) {
			mutex_enter(nso->so_direct->sod_lockp);
			SOD_DISABLE(nso->so_direct);
			mutex_exit(nso->so_direct->sod_lockp);
		}
	}
#ifdef DEBUG
	/*
	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
	 * it's inherited early to allow debugging of the accept code itself.
	 */
	nso->so_options |= so->so_options & SO_DEBUG;
#endif /* DEBUG */

	/*
	 * Save the SRC address from the T_CONN_IND
	 * for getpeername to work on AF_UNIX and on transports that do not
	 * support TI_GETPEERNAME.
	 *
	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
	 * copyin_name().
	 */
	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
		error = EINVAL;
		freemsg(mp);
		eprintsoline(so, error);
		goto disconnect_vp_unlocked;
	}
	nsti->sti_faddr_len = (socklen_t)srclen;
	/* Check the new socket's buffer, which is the one written below. */
	ASSERT(nsti->sti_faddr_len <= nsti->sti_faddr_maxlen);
	bcopy(src, nsti->sti_faddr_sa, srclen);
	nsti->sti_faddr_valid = 1;

	/*
	 * Reuse the T_CONN_IND mblk for the T_CONN_RES when it is not
	 * shared and is large enough; otherwise allocate a fresh one.
	 * Either way the peer credentials travel from the mblk to the
	 * new socket.
	 */
	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
		cred_t *peer_cr;

		if ((peer_cr = DB_CRED(mp)) != NULL) {
			crhold(peer_cr);
			nso->so_peercred = peer_cr;
			nso->so_cpid = DB_CPID(mp);
		}
		freemsg(mp);

		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
		    sizeof (intptr_t), 0, _ALLOC_INTR);
		if (mp == NULL) {
			/*
			 * Accept can not fail with ENOBUFS.
			 * A signal was caught so return EINTR.
			 */
			error = EINTR;
			eprintsoline(so, error);
			goto disconnect_vp_unlocked;
		}
		conn_res = (struct T_conn_res *)mp->b_rptr;
	} else {
		/* Hand the mblk's cred reference over to the new socket. */
		nso->so_peercred = DB_CRED(mp);
		nso->so_cpid = DB_CPID(mp);
		DB_CRED(mp) = NULL;

		mp->b_rptr = DB_BASE(mp);
		conn_res = (struct T_conn_res *)mp->b_rptr;
		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
	}

	/*
	 * New socket must be bound at least in sockfs and, except for AF_INET,
	 * (or AF_INET6) it also has to be bound in the transport provider.
	 * We set the local address in the sonode from the T_OK_ACK of the
	 * T_CONN_RES. For this reason the address we bind to here isn't
	 * important.
	 */
	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
	    /*CONSTCOND*/
	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
		/*
		 * Optimization for AF_INET{,6} transports
		 * that can handle a T_CONN_RES without being bound.
		 */
		mutex_enter(&nso->so_lock);
		so_automatic_bind(nso);
		mutex_exit(&nso->so_lock);
	} else {
		/* Perform NULL bind with the transport provider. */
		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
		    cr)) != 0) {
			ASSERT(error != ENOBUFS);
			freemsg(mp);
			eprintsoline(nso, error);
			goto disconnect_vp_unlocked;
		}
	}

	/*
	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
	 * so that any data arriving on the new socket will cause the
	 * appropriate signals to be delivered for the new socket.
	 *
	 * No other thread (except strsock_proto and strsock_misc)
	 * can access the new socket thus we relax the locking.
	 */
	nso->so_pgrp = so->so_pgrp;
	nso->so_state |= so->so_state & SS_ASYNC;
	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;

	if (nso->so_pgrp != 0) {
		/* A failure here is not fatal; just drop the pgrp. */
		if ((error = so_set_events(nso, nvp, CRED())) != 0) {
			eprintsoline(nso, error);
			error = 0;
			nso->so_pgrp = 0;
		}
	}

	/*
	 * Make note of the socket level options. TCP and IP level options
	 * are already inherited. We could do all this after accept is
	 * successful but doing it here simplifies code and no harm done
	 * for error case.
	 */
	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
	nso->so_sndbuf = so->so_sndbuf;
	nso->so_rcvbuf = so->so_rcvbuf;
	if (nso->so_options & SO_LINGER)
		nso->so_linger = so->so_linger;

	/*
	 * Note that the following sti_direct code path should be
	 * removed once we are confident that the direct sockets
	 * do not result in any degradation.
	 */
	if (sti->sti_direct) {

		ASSERT(opt != NULL);

		conn_res->OPT_length = optlen;
		conn_res->OPT_offset = MBLKL(mp);
		bcopy(&opt, mp->b_wptr, optlen);
		mp->b_wptr += optlen;
		conn_res->PRIM_type = T_CONN_RES;
		conn_res->ACCEPTOR_id = 0;
		PRIM_type = T_CONN_RES;

		/* Send down the T_CONN_RES on acceptor STREAM */
		error = kstrputmsg(SOTOV(nso), mp, NULL,
		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
		if (error) {
			/* disconnect_vp expects so_lock and SOLOCKED held. */
			mutex_enter(&so->so_lock);
			so_lock_single(so);
			eprintsoline(so, error);
			goto disconnect_vp;
		}
		mutex_enter(&nso->so_lock);
		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
		if (error) {
			mutex_exit(&nso->so_lock);
			mutex_enter(&so->so_lock);
			so_lock_single(so);
			eprintsoline(so, error);
			goto disconnect_vp;
		}
		/* The local address follows the T_ok_ack in the message. */
		if (nso->so_family == AF_INET) {
			sin_t *sin;

			sin = (sin_t *)(ack_mp->b_rptr +
			    sizeof (struct T_ok_ack));
			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
			nsti->sti_laddr_len = sizeof (sin_t);
		} else {
			sin6_t *sin6;

			sin6 = (sin6_t *)(ack_mp->b_rptr +
			    sizeof (struct T_ok_ack));
			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
			nsti->sti_laddr_len = sizeof (sin6_t);
		}
		freemsg(ack_mp);

		nso->so_state |= SS_ISCONNECTED;
		nso->so_proto_handle = (sock_lower_handle_t)opt;
		nsti->sti_laddr_valid = 1;

		if (sti->sti_nl7c_flags & NL7C_ENABLED) {
			/*
			 * A NL7C marked listen()er so the new socket
			 * inherits the listen()er's NL7C state, except
			 * for NL7C_POLLIN.
			 *
			 * Only call NL7C to process the new socket if
			 * the listen socket allows blocking i/o.
			 */
			nsti->sti_nl7c_flags =
			    sti->sti_nl7c_flags & (~NL7C_POLLIN);
			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
				/*
				 * Nonblocking accept() just make it
				 * persist to defer processing to the
				 * read-side syscall (e.g. read).
				 */
				nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
			} else if (nl7c_process(nso, B_FALSE)) {
				/*
				 * NL7C has completed processing on the
				 * socket, close the socket and back to
				 * the top to await the next T_CONN_IND.
				 */
				mutex_exit(&nso->so_lock);
				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
				    CRED(), NULL);
				VN_RELE(nvp);
				goto again;
			}
			/* Pass the new socket out */
		}
		mutex_exit(&nso->so_lock);

		/*
		 * It's possible, through the use of autopush for example,
		 * that the acceptor stream may not support sti_direct
		 * semantics. If the new socket does not support sti_direct
		 * we issue a _SIOCSOCKFALLBACK to inform the transport
		 * as we would in the I_PUSH case.
		 */
		if (nsti->sti_direct == 0) {
			int rval;

			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
			    0, 0, K_TO_K, CRED(), &rval)) != 0) {
				mutex_enter(&so->so_lock);
				so_lock_single(so);
				eprintsoline(so, error);
				goto disconnect_vp;
			}
		}

		/*
		 * Pass out new socket.
		 */
		if (nsop != NULL)
			*nsop = nso;

		return (0);
	}

	/*
	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
	 * which don't support the FireEngine accept fast-path. It is also
	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
	 * again. Neither sockfs nor TCP attempt to find out if some other
	 * random module has been inserted in between (in which case we
	 * should follow TLI accept behaviour). We blindly assume the worst
	 * case and revert back to old behaviour i.e. TCP will not send us
	 * any option (eager) and the accept should happen on the listener
	 * queue. Any queued T_conn_ind have already got their options removed
	 * by so_sock2_stream() when "sockmod" was I_POP'd.
	 */

	/*
	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
	 */
	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
#ifdef	_ILP32
		queue_t	*q;

		/*
		 * Find read queue in driver
		 * Can safely do this since we "own" nso/nvp.
		 */
		q = strvp2wq(nvp)->q_next;
		while (SAMESTR(q))
			q = q->q_next;
		q = RD(q);
		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
#else
		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
#endif	/* _ILP32 */
		conn_res->PRIM_type = O_T_CONN_RES;
		PRIM_type = O_T_CONN_RES;
	} else {
		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
		conn_res->PRIM_type = T_CONN_RES;
		PRIM_type = T_CONN_RES;
	}
	conn_res->SEQ_number = SEQ_number;
	conn_res->OPT_length = 0;
	conn_res->OPT_offset = 0;

	mutex_enter(&so->so_lock);
	so_lock_single(so);	/* Set SOLOCKED */
	mutex_exit(&so->so_lock);

	error = kstrputmsg(SOTOV(so), mp, NULL,
	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
	mutex_enter(&so->so_lock);
	if (error) {
		eprintsoline(so, error);
		goto disconnect_vp;
	}
	error = sowaitprim(so, PRIM_type, T_OK_ACK,
	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
	if (error) {
		eprintsoline(so, error);
		goto disconnect_vp;
	}
	/*
	 * If there is a sin/sin6 appended onto the T_OK_ACK use
	 * that to set the local address. If this is not present
	 * then we zero out the address and don't set the
	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
	 * the pathname from the listening socket.
	 *
	 * The family test is parenthesized so that the length check
	 * applies to AF_INET as well as AF_INET6; since && binds tighter
	 * than ||, leaving the parentheses out would let an AF_INET ack
	 * without an appended address be copied from regardless.
	 */
	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
		ack_mp->b_rptr += sizeof (struct T_ok_ack);
		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
		nsti->sti_laddr_len = sinlen;
		nsti->sti_laddr_valid = 1;
	} else if (nso->so_family == AF_UNIX) {
		ASSERT(so->so_family == AF_UNIX);
		nsti->sti_laddr_len = sti->sti_laddr_len;
		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
		    nsti->sti_laddr_len);
		nsti->sti_laddr_valid = 1;
	} else {
		nsti->sti_laddr_len = sti->sti_laddr_len;
		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
		nsti->sti_laddr_sa->sa_family = nso->so_family;
	}
	freemsg(ack_mp);

	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);

	nso->so_state |= SS_ISCONNECTED;

	/*
	 * Pass out new socket.
	 */
	if (nsop != NULL)
		*nsop = nso;

	return (0);

	/*
	 * Error exits.  The "_unlocked" labels are entered without so_lock
	 * held; disconnect_vp is entered with so_lock held and SOLOCKED set.
	 */
e_disc_unl:
	eprintsoline(so, error);
	goto disconnect_unlocked;

disconnect_vp_unlocked:
	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
	VN_RELE(nvp);
disconnect_unlocked:
	(void) sodisconnect(so, SEQ_number, 0);
	return (error);

disconnect_vp:
	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
	VN_RELE(nvp);
	return (error);

conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
	    ? EOPNOTSUPP : EINVAL;
e_bad:
	eprintsoline(so, error);
	return (error);
}
/*
* connect a socket.
*
* Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
* unconnect (by specifying a null address).
*/
int
sotpi_connect(struct sonode *so,
const struct sockaddr *name,
socklen_t namelen,
int fflag,
int flags,
struct cred *cr)
{
struct T_conn_req conn_req;
int error = 0;
mblk_t *mp;
void *src;
socklen_t srclen;
void *addr;
socklen_t addrlen;
boolean_t need_unlock;
sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
(void *)so, (void *)name, namelen, fflag, flags,
pr_state(so->so_state, so->so_mode)));
/*
* Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
* avoid sleeping for memory with SOLOCKED held.
* We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
* + sizeof (struct T_opthdr).
* (the AF_UNIX so_ux_addr_xlate() does not make the address
* exceed sti_faddr_maxlen).
*/
mp = soallocproto(sizeof (struct T_conn_req) +
2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
if (mp == NULL) {
/*
* Connect can not fail with ENOBUFS. A signal was
* caught so return EINTR.
*/
error = EINTR;
eprintsoline(so, error);
return (error);
}
mutex_enter(&so->so_lock);
/*
* Make sure there is a preallocated T_unbind_req message
* before any binding. This message is allocated when the
* socket is created. Since another thread can consume
* so_unbind_mp by the time we return from so_lock_single(),
* we should check the availability of so_unbind_mp after
* we return from so_lock_single().
*/
so_lock_single(so); /* Set SOLOCKED */
need_unlock = B_TRUE;
if (sti->sti_unbind_mp == NULL) {
dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
/* NOTE: holding so_lock while sleeping */
sti->sti_unbind_mp =
soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR);
if (sti->sti_unbind_mp == NULL) {
error = EINTR;
goto done;
}
}
/*
* Can't have done a listen before connecting.
*/
if (so->so_state & SS_ACCEPTCONN) {
error = EOPNOTSUPP;
goto done;
}
/*
* Must be bound with the transport
*/
if (!(so->so_state & SS_ISBOUND)) {
if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
/*CONSTCOND*/
so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
/*
* Optimization for AF_INET{,6} transports
* that can handle a T_CONN_REQ without being bound.
*/
so_automatic_bind(so);
} else {
error = sotpi_bind(so, NULL, 0,
_SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
if (error)
goto done;
}
ASSERT(so->so_state & SS_ISBOUND);
flags |= _SOCONNECT_DID_BIND;
}
/*
* Handle a connect to a name parameter of type AF_UNSPEC like a
* connect to a null address. This is the portable method to
* unconnect a socket.
*/
if ((namelen >= sizeof (sa_family_t)) &&
(name->sa_family == AF_UNSPEC)) {
name = NULL;
namelen = 0;
}
/*
* Check that we are not already connected.
* A connection-oriented socket cannot be reconnected.
* A connected connection-less socket can be
* - connected to a different address by a subsequent connect
* - "unconnected" by a connect to the NULL address
*/
if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
ASSERT(!(flags & _SOCONNECT_DID_BIND));
if (so->so_mode & SM_CONNREQUIRED) {
/* Connection-oriented socket */
error = so->so_state & SS_ISCONNECTED ?
EISCONN : EALREADY;
goto done;
}
/* Connection-less socket */
if (name == NULL) {
/*
* Remove the connected state and clear SO_DGRAM_ERRIND
* since it was set when the socket was connected.
* If this is UDP also send down a T_DISCON_REQ.
*/
int val;
if ((so->so_family == AF_INET ||
so->so_family == AF_INET6) &&
(so->so_type == SOCK_DGRAM ||
so->so_type == SOCK_RAW) &&
/*CONSTCOND*/
!soconnect_tpi_udp) {
/* XXX What about implicitly unbinding here? */
error = sodisconnect(so, -1,
_SODISCONNECT_LOCK_HELD);
} else {
so->so_state &=
~(SS_ISCONNECTED | SS_ISCONNECTING);
sti->sti_faddr_valid = 0;
sti->sti_faddr_len = 0;
}
/* Remove SOLOCKED since setsockopt will grab it */
so_unlock_single(so, SOLOCKED);
mutex_exit(&so->so_lock);
val = 0;
(void) sotpi_setsockopt(so, SOL_SOCKET,
SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
cr);
mutex_enter(&so->so_lock);
so_lock_single(so); /* Set SOLOCKED */
goto done;
}
}
ASSERT(so->so_state & SS_ISBOUND);
if (name == NULL || namelen == 0) {
error = EINVAL;
goto done;
}
/*
* Mark the socket if sti_faddr_sa represents the transport level
* address.
*/
if (flags & _SOCONNECT_NOXLATE) {
struct sockaddr_ux *soaddr_ux;
ASSERT(so->so_family == AF_UNIX);
if (namelen != sizeof (struct sockaddr_ux)) {
error = EINVAL;
goto done;
}
soaddr_ux = (struct sockaddr_ux *)name;
name = (struct sockaddr *)&soaddr_ux->sou_addr;
namelen = sizeof (soaddr_ux->sou_addr);
sti->sti_faddr_noxlate = 1;
}
/*
* Length and family checks.
*/
error = so_addr_verify(so, name, namelen);
if (error)
goto bad;
/*
* Save foreign address. Needed for AF_UNIX as well as
* transport providers that do not support TI_GETPEERNAME.
* Also used for cached foreign address for TCP and UDP.
*/
if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
error = EINVAL;
goto done;
}
sti->sti_faddr_len = (socklen_t)namelen;
ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
bcopy(name, sti->sti_faddr_sa, namelen);
sti->sti_faddr_valid = 1;
if (so->so_family == AF_UNIX) {
if (sti->sti_faddr_noxlate) {
/*
* Already have a transport internal address. Do not
* pass any (transport internal) source address.
*/
addr = sti->sti_faddr_sa;
addrlen = (t_uscalar_t)sti->sti_faddr_len;
src = NULL;
srclen = 0;
} else {
/*
* Pass the sockaddr_un source address as an option
* and translate the remote address.
* Holding so_lock thus sti_laddr_sa can not change.
*/
src = sti->sti_laddr_sa;
srclen = (t_uscalar_t)sti->sti_laddr_len;
dprintso(so, 1,
("sotpi_connect UNIX: srclen %d, src %p\n",
srclen, src));
error = so_ux_addr_xlate(so,
sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
(flags & _SOCONNECT_XPG4_2),
&addr, &addrlen);
if (error)
goto bad;
}
} else {
addr = sti->sti_faddr_sa;
addrlen = (t_uscalar_t)sti->sti_faddr_len;
src = NULL;
srclen = 0;
}
/*
* When connecting a datagram socket we issue the SO_DGRAM_ERRIND
* option which asks the transport provider to send T_UDERR_IND
* messages. These T_UDERR_IND messages are used to return connected
* style errors (e.g. ECONNRESET) for connected datagram sockets.
*
* In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
* we send down a T_CONN_REQ. This is needed to let the
* transport assign a local address that is consistent with
* the remote address. Applications depend on a getsockname()
* after a connect() to retrieve the "source" IP address for
* the connected socket. Invalidate the cached local address
* to force getsockname() to enquire of the transport.
*/
if (!(so->so_mode & SM_CONNREQUIRED)) {
/*
* Datagram socket.
*/
int32_t val;
so_unlock_single(so, SOLOCKED);
mutex_exit(&so->so_lock);
val = 1;
(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
&val, (t_uscalar_t)sizeof (val), cr);
mutex_enter(&so->so_lock);
so_lock_single(so); /* Set SOLOCKED */
if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
(so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
soconnect_tpi_udp) {
soisconnected(so);
goto done;
}
/*
* Send down T_CONN_REQ etc.
* Clear fflag to avoid returning EWOULDBLOCK.
*/
fflag = 0;
ASSERT(so->so_family != AF_UNIX);
sti->sti_laddr_valid = 0;
} else if (sti->sti_laddr_len != 0) {
/*
* If the local address or port was "any" then it may be
* changed by the transport as a result of the
* connect. Invalidate the cached version if we have one.
*/
switch (so->so_family) {
case AF_INET:
ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
INADDR_ANY ||
((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
sti->sti_laddr_valid = 0;
break;
case AF_INET6:
ASSERT(sti->sti_laddr_len ==
(socklen_t)sizeof (sin6_t));
if (IN6_IS_ADDR_UNSPECIFIED(
&((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
IN6_IS_ADDR_V4MAPPED_ANY(
&((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
sti->sti_laddr_valid = 0;
break;
default:
break;
}
}
/*
* Check for failure of an earlier call
*/
if (so->so_error != 0)
goto so_bad;
/*
* Send down T_CONN_REQ. Message was allocated above.
*/
conn_req.PRIM_type = T_CONN_REQ;
conn_req.DEST_length = addrlen;
conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
if (srclen == 0) {
conn_req.OPT_length = 0;
conn_req.OPT_offset = 0;
soappendmsg(mp, &conn_req, sizeof (conn_req));
soappendmsg(mp, addr, addrlen);
} else {
/*
* There is a AF_UNIX sockaddr_un to include as a source
* address option.
*/
struct T_opthdr toh;
toh.level = SOL_SOCKET;
toh.name = SO_SRCADDR;
toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
toh.status = 0;
conn_req.OPT_length =
(t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
_TPI_ALIGN_TOPT(addrlen));
soappendmsg(mp, &conn_req, sizeof (conn_req));
soappendmsg(mp, addr, addrlen);
mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
soappendmsg(mp, &toh, sizeof (toh));
soappendmsg(mp, src, srclen);
mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
}
/*
* Set SS_ISCONNECTING before sending down the T_CONN_REQ
* in order to have the right state when the T_CONN_CON shows up.
*/
soisconnecting(so);
mutex_exit(&so->so_lock);
if (audit_active)
audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
mp = NULL;
mutex_enter(&so->so_lock);
if (error != 0)
goto bad;
if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
goto bad;
/* Allow other threads to access the socket */
so_unlock_single(so, SOLOCKED);
need_unlock = B_FALSE;
/*
* Wait until we get a T_CONN_CON or an error
*/
if ((error = sowaitconnected(so, fflag, 0)) != 0) {
so_lock_single(so); /* Set SOLOCKED */
need_unlock = B_TRUE;
}
done:
freemsg(mp);
switch (error) {
case EINPROGRESS:
case EALREADY:
case EISCONN:
case EINTR:
/* Non-fatal errors */
sti->sti_laddr_valid = 0;
/* FALLTHRU */
case 0:
break;
default:
ASSERT(need_unlock);
/*
* Fatal errors: clear SS_ISCONNECTING in case it was set,
* and invalidate local-address cache
*/
so->so_state &= ~SS_ISCONNECTING;
sti->sti_laddr_valid = 0;
/* A discon_ind might have already unbound us */
if ((flags & _SOCONNECT_DID_BIND) &&
(so->so_state & SS_ISBOUND)) {
int err;
err = sotpi_unbind(so, 0);
/* LINTED - statement has no conseq */
if (err) {
eprintsoline(so, err);
}
}
break;
}
if (need_unlock)
so_unlock_single(so, SOLOCKED);
mutex_exit(&so->so_lock);
return (error);
so_bad: error = sogeterr(so, B_TRUE);
bad: eprintsoline(so, error);
goto done;
}
/*
 * Shut down the receive side, the send side, or both sides of a socket.
 *
 * so  - socket to operate on.
 * how - 0 stops further receives, 1 stops further sends, 2 stops both.
 *       Any other value fails with EINVAL.
 * cr  - caller credentials (unused).
 *
 * Returns 0 on success or an errno value.  An unconnected socket yields
 * ENOTCONN unless xnet_skip_checks is set, in which case 0 is returned
 * without doing anything.
 *
 * SOLOCKED is held for the duration of the call; so_lock itself is dropped
 * around the calls into the stream head.
 */
/* ARGSUSED */
int
sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
{
	sotpi_info_t		*sti = SOTOTPI(so);
	struct T_ordrel_req	ordrel_req;
	mblk_t			*ordrel_mp;
	uint_t			prev_state, changed;
	int			error = 0;

	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
	    (void *)so, how, pr_state(so->so_state, so->so_mode)));

	mutex_enter(&so->so_lock);
	so_lock_single(so);	/* Set SOLOCKED */

	/*
	 * SunOS 4.X has no check for datagram sockets.
	 * 5.X checks that it is connected (ENOTCONN)
	 * X/Open requires that we check the connected state.
	 */
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if (xnet_skip_checks)
			goto done;
		error = ENOTCONN;
		if (xnet_check_print) {
			printf("sockfs: X/Open shutdown check "
			    "caused ENOTCONN\n");
		}
		goto done;
	}

	/*
	 * Snapshot the state, apply the requested transitions, and then
	 * derive from the difference which notifications are required.
	 * Working from the difference avoids e.g. a second T_ORDREL_REQ
	 * when shutdown() is called twice.
	 */
	prev_state = so->so_state;
	if (how < 0 || how > 2) {
		error = EINVAL;
		goto done;
	}
	if (how != 0)		/* 1 or 2: no more sends */
		socantsendmore(so);
	if (how != 1)		/* 0 or 2: no more receives */
		socantrcvmore(so);

	/*
	 * Relies on the SS_CANT* flags never being cleared by the calls
	 * above, so the subtraction yields exactly the newly set bits.
	 */
	changed = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
	    (prev_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
	ASSERT((changed & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);

	switch (changed) {
	case 0:
		dprintso(so, 1,
		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
		    so->so_state));
		goto done;

	case SS_CANTSENDMORE:
		mutex_exit(&so->so_lock);
		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
		mutex_enter(&so->so_lock);
		break;

	case SS_CANTRCVMORE:
	case SS_CANTSENDMORE|SS_CANTRCVMORE:
		mutex_exit(&so->so_lock);
		/* Only the combined case also posts the write-side error */
		if (changed & SS_CANTSENDMORE)
			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
		/*
		 * strseteof takes care of read side wakeups,
		 * pollwakeups, and signals.
		 */
		strseteof(SOTOV(so), 1);
		/*
		 * Take the read lock before flushing so that the flush
		 * cannot interfere with the T_EXDATA_IND MSG_PEEK code
		 * in sotpi_recvmsg.
		 */
		mutex_enter(&so->so_lock);
		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
		mutex_exit(&so->so_lock);

		/* Flush read side queue */
		strflushrq(SOTOV(so), FLUSHALL);

		mutex_enter(&so->so_lock);
		so_unlock_read(so);		/* Clear SOREADLOCKED */
		break;
	}

	ASSERT(MUTEX_HELD(&so->so_lock));

	/*
	 * If this call set SS_CANTSENDMORE and/or SS_CANTRCVMORE and the
	 * resulting state has both of them set:
	 *	Send the AF_UNIX close indication
	 *	For T_COTS send a discon_ind
	 *
	 * If cantsend was set due to this call:
	 *	For T_COTSORD send an ordrel_ind
	 *
	 * Note that for T_CLTS there is no message sent here.
	 */
	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
		/*
		 * For SunOS 4.X compatibility we tell the other end
		 * that we are unable to receive at this point.
		 */
		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
			so_unix_close(so);

		if (sti->sti_serv_type == T_COTS)
			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
	}

	if (sti->sti_serv_type == T_COTS_ORD &&
	    (changed & SS_CANTSENDMORE) != 0) {
		/* Send an orderly release */
		ordrel_req.PRIM_type = T_ORDREL_REQ;

		mutex_exit(&so->so_lock);
		ordrel_mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
		    0, _ALLOC_SLEEP);
		/*
		 * Send down the T_ORDREL_REQ even if there is flow control.
		 * This prevents shutdown from blocking.
		 * Note that there is no T_OK_ACK for ordrel_req.
		 */
		error = kstrputmsg(SOTOV(so), ordrel_mp, NULL, 0, 0,
		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
		mutex_enter(&so->so_lock);
		if (error) {
			eprintsoline(so, error);
			goto done;
		}
	}

done:
	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
	return (error);
}
/*
* For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
* a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
* that we have closed.
* Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
* T_UNITDATA_REQ containing the same option.
*
* For SOCK_DGRAM half-connections (somebody connected to this end
* but this end is not connect) we don't know where to send any
* SO_UNIX_CLOSE.
*
* We have to ignore stream head errors just in case there has been
* a shutdown(output).
* Ignore any flow control to try to get the message more quickly to the peer.
* While locally ignoring flow control solves the problem when there
* is only the loopback transport on the stream it would not provide
* the correct AF_UNIX socket semantics when one or more modules have
* been pushed.
*/
void
so_unix_close(struct sonode *so)
{
int error;
struct T_opthdr toh;
mblk_t *mp;
sotpi_info_t *sti = SOTOTPI(so);
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(so->so_family == AF_UNIX);
if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
(SS_ISCONNECTED|SS_ISBOUND))
return;
dprintso(so, 1, ("so_unix_close(%p) %s\n",
(void *)so, pr_state(so->so_state, so->so_mode)));
toh.level = SOL_SOCKET;
toh.name = SO_UNIX_CLOSE;
/* zero length + header */
toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
toh.status = 0;
if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
struct T_optdata_req tdr;
tdr.PRIM_type = T_OPTDATA_REQ;
tdr.DATA_flag = 0;
tdr.OPT_length = (t_scalar_t)sizeof (toh);
tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
/* NOTE: holding so_lock while sleeping */
mp = soallocproto2(&tdr, sizeof (tdr),
&toh, sizeof (toh), 0, _ALLOC_SLEEP);
} else {
struct T_unitdata_req tudr;
void *addr;
socklen_t addrlen;
void *src;
socklen_t srclen;
struct T_opthdr toh2;
t_scalar_t size;
/* Connecteded DGRAM socket */
/*
* For AF_UNIX the destination address is translated to
* an internal name and the source address is passed as
* an option.
*/
/*
* Length and family checks.
*/
error = so_addr_verify(so, sti->sti_faddr_sa,
(t_uscalar_t)sti->sti_faddr_len);
if (error) {
eprintsoline(so, error);
return;
}
if (sti->sti_faddr_noxlate) {
/*
* Already have a transport internal address. Do not
* pass any (transport internal) source address.
*/
addr = sti->sti_faddr_sa;
addrlen = (t_uscalar_t)sti->sti_faddr_len;
src = NULL;
srclen = 0;
} else {
/*
* Pass the sockaddr_un source address as an option
* and translate the remote address.
* Holding so_lock thus sti_laddr_sa can not change.
*/
src = sti->sti_laddr_sa;
srclen = (socklen_t)sti->sti_laddr_len;
dprintso(so, 1,
("so_ux_close: srclen %d, src %p\n",
srclen, src));
error = so_ux_addr_xlate(so,
sti->sti_faddr_sa,
(socklen_t)sti->sti_faddr_len, 0,
&addr, &addrlen);
if (error) {
eprintsoline(so, error);
return;
}
}
tudr.PRIM_type = T_UNITDATA_REQ;
tudr.DEST_length = addrlen;
tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
if (srclen == 0) {
tudr.OPT_length = (t_scalar_t)sizeof (toh);
tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
_TPI_ALIGN_TOPT(addrlen));
size = tudr.OPT_offset + tudr.OPT_length;
/* NOTE: holding so_lock while sleeping */
mp = soallocproto2(&tudr, sizeof (tudr),
addr, addrlen, size, _ALLOC_SLEEP);
mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
soappendmsg(mp, &toh, sizeof (toh));
} else {
/*
* There is a AF_UNIX sockaddr_un to include as a
* source address option.
*/
tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
_TPI_ALIGN_TOPT(srclen));
tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
_TPI_ALIGN_TOPT(addrlen));
toh2.level = SOL_SOCKET;
toh2.name = SO_SRCADDR;
toh2.len = (t_uscalar_t)(srclen +
sizeof (struct T_opthdr));
toh2.status = 0;
size = tudr.OPT_offset + tudr.OPT_length;
/* NOTE: holding so_lock while sleeping */
mp = soallocproto2(&tudr, sizeof (tudr),
addr, addrlen, size, _ALLOC_SLEEP);
mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
soappendmsg(mp, &toh, sizeof (toh));
soappendmsg(mp, &toh2, sizeof (toh2));
soappendmsg(mp, src, srclen);
mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
}
ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
}
mutex_exit(&so->so_lock);
error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
mutex_enter(&so->so_lock);
}
/*
* Called by sotpi_recvmsg when reading a non-zero amount of data.
* In addition, the caller typically verifies that there is some
* potential state to clear by checking
* if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
* before calling this routine.
* Note that such a check can be made without holding so_lock since
* sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
* decrements sti_oobsigcnt.
*
* When data is read *after* the point that all pending
* oob data has been consumed the oob indication is cleared.
*
* This logic keeps select/poll returning POLLRDBAND and
* SIOCATMARK returning true until we have read past
* the mark.
*/
static void
sorecv_update_oobstate(struct sonode *so)
{
sotpi_info_t *sti = SOTOTPI(so);
mutex_enter(&so->so_lock);
ASSERT(so_verify_oobstate(so));
dprintso(so, 1,
("sorecv_update_oobstate: counts %d/%d state %s\n",
sti->sti_oobsigcnt,
sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
if (sti->sti_oobsigcnt == 0) {
/* No more pending oob indications */
so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
freemsg(so->so_oobmsg);
so->so_oobmsg = NULL;
}
ASSERT(so_verify_oobstate(so));
mutex_exit(&so->so_lock);
}
/*
 * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
 *
 * Walks the chain saved in sti_nl7c_rcv_mp:
 *  - M_DATA blocks are copied out to uiop, up to uio_resid bytes;
 *  - any other block is unlinked from the chain and appended, in order,
 *    to *rmp for the caller to process.
 *
 * so   - socket whose saved NL7C chain is drained (chain must be non-NULL).
 * rmp  - in/out: chain of non-data mblks handed to the caller.
 * uiop - destination for the data.
 * rp   - out: the saved kstrgetmsg() rval once the chain is exhausted,
 *        otherwise 0.
 *
 * Returns 0, a uiomove() error, or (when the chain is exhausted and no
 * other error occurred) the error that was saved in the rval.
 *
 * Fully consumed data blocks are freed; whatever remains is stored back
 * in sti_nl7c_rcv_mp for the next call.
 */
static int
nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
{
	sotpi_info_t	*sti = SOTOTPI(so);
	int		error = 0;
	/* First block still owned by the saved chain (consumed or not) */
	mblk_t		*head = sti->sti_nl7c_rcv_mp;
	/* Tail of the chain handed to the caller through *rmp */
	mblk_t		*tmp = NULL;
	/* Last fully consumed M_DATA block */
	mblk_t		*pmp = NULL;
	mblk_t		*nmp = head;

	ASSERT(nmp != NULL);

	while (nmp != NULL && uiop->uio_resid > 0) {
		ssize_t n;

		if (DB_TYPE(nmp) == M_DATA) {
			/*
			 * We have some data, uiomove up to resid bytes.
			 */
			n = MIN(MBLKL(nmp), uiop->uio_resid);
			if (n > 0)
				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
			nmp->b_rptr += n;
			if (nmp->b_rptr == nmp->b_wptr) {
				pmp = nmp;
				nmp = nmp->b_cont;
			}
			if (error)
				break;
		} else {
			/*
			 * We only handle data, save for caller to handle.
			 *
			 * Remember the successor before unlinking: the
			 * previous code cleared b_cont and then followed
			 * it, which always ended the walk and lost the
			 * rest of the chain.
			 */
			mblk_t *next = nmp->b_cont;

			if (pmp != NULL) {
				pmp->b_cont = next;
			} else {
				/*
				 * No consumed data precedes this block, so
				 * it was the head; the saved chain now
				 * starts at its successor.
				 */
				head = next;
			}
			nmp->b_cont = NULL;

			if (*rmp == NULL) {
				*rmp = nmp;
			} else {
				if (tmp == NULL) {
					/* Caller supplied a chain; find tail */
					tmp = *rmp;
					while (tmp->b_cont != NULL)
						tmp = tmp->b_cont;
				}
				tmp->b_cont = nmp;
			}
			tmp = nmp;
			nmp = next;
		}
	}

	if (pmp != NULL) {
		/*
		 * Free any mblk_t(s) which we have consumed.  head..pmp
		 * holds only consumed M_DATA blocks because every block
		 * handed to the caller was unlinked above.
		 */
		pmp->b_cont = NULL;
		freemsg(head);
	}

	if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
		if (error == 0) {
			rval_t	*p = (rval_t *)&sti->sti_nl7c_rcv_rval;

			error = p->r_v.r_v2;
			p->r_v.r_v2 = 0;
		}
		rp->r_vals = sti->sti_nl7c_rcv_rval;
		sti->sti_nl7c_rcv_rval = 0;
	} else {
		/* More mblk_t(s) to process so no rval to return */
		rp->r_vals = 0;
	}
	return (error);
}
/*
 * Receive the next message on the queue.
 * If msg_controllen is non-zero when called the caller is interested in
 * any received control info (options).
 * If msg_namelen is non-zero when called the caller is interested in
 * any received source address.
 * The routine returns with msg_control and msg_name pointing to
 * kmem_alloc'ed memory which the caller has to free.
 *
 * so   - socket to receive on.
 * msg  - in: msg_flags (MSG_*), msg_namelen and msg_controllen express
 *        what the caller wants; out: msg_flags, msg_name/msg_namelen and
 *        msg_control/msg_controllen describe what was received.
 * uiop - destination for the data.
 * cr   - caller credentials (only used for the SOV_STREAM strread path).
 *
 * Returns 0 or an errno value.  When an error is returned after a source
 * address has already been allocated, the address is freed here and
 * msg_name/msg_namelen are reset so that no dangling pointer is left in
 * *msg.
 */
/* ARGSUSED */
int
sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    struct cred *cr)
{
	union T_primitives	*tpr;
	mblk_t			*mp;
	uchar_t			pri;
	int			pflag, opflag;
	void			*control;
	t_uscalar_t		controllen;
	t_uscalar_t		namelen;
	int			so_state = so->so_state; /* Snapshot */
	ssize_t			saved_resid;
	rval_t			rval;
	int			flags;
	clock_t			timout;
	int			error = 0;
	int			reterr = 0;
	struct uio		*suiop = NULL;
	sotpi_info_t		*sti = SOTOTPI(so);

	flags = msg->msg_flags;
	msg->msg_flags = 0;

	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
	    (void *)so, (void *)msg, flags,
	    pr_state(so->so_state, so->so_mode), so->so_error));

	if (so->so_version == SOV_STREAM) {
		so_update_attrs(so, SOACC);
		/* The imaginary "sockmod" has been popped - act as a stream */
		return (strread(SOTOV(so), uiop, cr));
	}

	/*
	 * If we are not connected because we have never been connected
	 * we return ENOTCONN. If we have been connected (but are no longer
	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
	 * the EOF.
	 *
	 * An alternative would be to post an ENOTCONN error in stream head
	 * (read+write) and clear it when we're connected. However, that error
	 * would cause incorrect poll/select behavior!
	 */
	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
	    (so->so_mode & SM_CONNREQUIRED)) {
		return (ENOTCONN);
	}

	/*
	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
	 * after checking that the read queue is empty) and returns zero.
	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
	 * is zero.
	 */

	if (flags & MSG_OOB) {
		/* Check that the transport supports OOB */
		if (!(so->so_mode & SM_EXDATA))
			return (EOPNOTSUPP);
		so_update_attrs(so, SOACC);
		return (sorecvoob(so, msg, uiop, flags,
		    (so->so_options & SO_OOBINLINE)));
	}

	so_update_attrs(so, SOACC);

	/*
	 * Set msg_controllen and msg_namelen to zero here to make it
	 * simpler in the cases that no control or name is returned.
	 */
	controllen = msg->msg_controllen;
	namelen = msg->msg_namelen;
	msg->msg_controllen = 0;
	msg->msg_namelen = 0;

	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
	    namelen, controllen));

	mutex_enter(&so->so_lock);
	/*
	 * If an NL7C enabled socket and not waiting for write data.
	 */
	if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
	    NL7C_ENABLED) {
		if (sti->sti_nl7c_uri) {
			/* Close uri processing for a previous request */
			nl7c_close(so);
		}
		if ((so_state & SS_CANTRCVMORE) &&
		    sti->sti_nl7c_rcv_mp == NULL) {
			/* Nothing to process, EOF */
			mutex_exit(&so->so_lock);
			return (0);
		} else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
			/* Persistent NL7C socket, try to process request */
			boolean_t ret;

			ret = nl7c_process(so,
			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
			rval.r_vals = sti->sti_nl7c_rcv_rval;
			error = rval.r_v.r_v2;
			if (error) {
				/* Error of some sort, return it */
				mutex_exit(&so->so_lock);
				return (error);
			}
			if (sti->sti_nl7c_flags &&
			    ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
				/*
				 * Still an NL7C socket and no data
				 * to pass up to the caller.
				 */
				mutex_exit(&so->so_lock);
				if (ret) {
					/* EOF */
					return (0);
				} else {
					/* Need more data */
					return (EAGAIN);
				}
			}
		} else {
			/*
			 * Not persistent so no further NL7C processing.
			 */
			sti->sti_nl7c_flags = 0;
		}
	}
	/*
	 * Only one reader is allowed at any given time. This is needed
	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
	 *
	 * This is slightly different that BSD behavior in that it fails with
	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
	 * is single-threaded using sblock(), which is dropped while waiting
	 * for data to appear. The difference shows up e.g. if one
	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
	 * does use nonblocking io and different threads are reading each
	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
	 * in this case as long as the read queue doesn't get empty.
	 * In this implementation the thread using nonblocking io can
	 * get an EWOULDBLOCK error due to the blocking thread executing
	 * e.g. in the uiomove in kstrgetmsg.
	 * This difference is not believed to be significant.
	 */
	/* Set SOREADLOCKED */
	error = so_lock_read_intr(so,
	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
	mutex_exit(&so->so_lock);
	if (error)
		return (error);

	/*
	 * Tell kstrgetmsg to not inspect the stream head errors until all
	 * queued data has been consumed.
	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
	 *
	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
	 */
	pflag = MSG_ANY | MSG_DELAYERROR;
	if (flags & MSG_PEEK) {
		pflag |= MSG_IPEEK;
		flags &= ~MSG_WAITALL;
	}
	if (so->so_mode & SM_ATOMIC)
		pflag |= MSG_DISCARDTAIL;

	if (flags & MSG_DONTWAIT)
		timout = 0;
	else
		timout = -1;
	/* Remember the base flags; retries rebuild pflag from them */
	opflag = pflag;

	suiop = sod_rcv_init(so, flags, &uiop);
retry:
	saved_resid = uiop->uio_resid;
	pri = 0;
	mp = NULL;
	if (sti->sti_nl7c_rcv_mp != NULL) {
		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
		error = nl7c_sorecv(so, &mp, uiop, &rval);
	} else {
		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
		    timout, &rval);
	}
	if (error != 0) {
		/* kstrgetmsg returns ETIME when timeout expires */
		if (error == ETIME)
			error = EWOULDBLOCK;
		goto out;
	}
	/*
	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
	 * For non-datagrams MOREDATA is used to set MSG_EOR.
	 */
	ASSERT(!(rval.r_val1 & MORECTL));
	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
		msg->msg_flags |= MSG_TRUNC;

	if (mp == NULL) {
		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
		/*
		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
		 * The draft Posix socket spec states that the mark should
		 * not be cleared when peeking. We follow the latter.
		 */
		if ((so->so_state &
		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
		    (uiop->uio_resid != saved_resid) &&
		    !(flags & MSG_PEEK)) {
			sorecv_update_oobstate(so);
		}

		mutex_enter(&so->so_lock);
		/* Set MSG_EOR based on MOREDATA */
		if (!(rval.r_val1 & MOREDATA)) {
			if (so->so_state & SS_SAVEDEOR) {
				msg->msg_flags |= MSG_EOR;
				so->so_state &= ~SS_SAVEDEOR;
			}
		}
		/*
		 * If some data was received (i.e. not EOF) and the
		 * read/recv* has not been satisfied wait for some more.
		 */
		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
			mutex_exit(&so->so_lock);
			pflag = opflag | MSG_NOMARK;
			goto retry;
		}
		goto out_locked;
	}

	/* strsock_proto has already verified length and alignment */
	tpr = (union T_primitives *)mp->b_rptr;
	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));

	switch (tpr->type) {
	case T_DATA_IND: {
		if ((so->so_state &
		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
		    (uiop->uio_resid != saved_resid) &&
		    !(flags & MSG_PEEK)) {
			sorecv_update_oobstate(so);
		}

		/*
		 * Set msg_flags to MSG_EOR based on
		 * MORE_flag and MOREDATA.
		 */
		mutex_enter(&so->so_lock);
		so->so_state &= ~SS_SAVEDEOR;
		if (!(tpr->data_ind.MORE_flag & 1)) {
			if (!(rval.r_val1 & MOREDATA))
				msg->msg_flags |= MSG_EOR;
			else
				so->so_state |= SS_SAVEDEOR;
		}
		freemsg(mp);
		/*
		 * If some data was received (i.e. not EOF) and the
		 * read/recv* has not been satisfied wait for some more.
		 */
		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
			mutex_exit(&so->so_lock);
			pflag = opflag | MSG_NOMARK;
			goto retry;
		}
		goto out_locked;
	}
	case T_UNITDATA_IND: {
		void *addr;
		t_uscalar_t addrlen;
		void *abuf;
		t_uscalar_t optlen;
		void *opt;

		if ((so->so_state &
		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
		    (uiop->uio_resid != saved_resid) &&
		    !(flags & MSG_PEEK)) {
			sorecv_update_oobstate(so);
		}

		if (namelen != 0) {
			/* Caller wants source address */
			addrlen = tpr->unitdata_ind.SRC_length;
			addr = sogetoff(mp,
			    tpr->unitdata_ind.SRC_offset,
			    addrlen, 1);
			if (addr == NULL) {
				freemsg(mp);
				error = EPROTO;
				eprintsoline(so, error);
				goto out;
			}
			if (so->so_family == AF_UNIX) {
				/*
				 * Can not use the transport level address.
				 * If there is a SO_SRCADDR option carrying
				 * the socket level address it will be
				 * extracted below.
				 */
				addr = NULL;
				addrlen = 0;
			}
		}
		optlen = tpr->unitdata_ind.OPT_length;
		if (optlen != 0) {
			t_uscalar_t ncontrollen;

			/*
			 * Extract any source address option.
			 * Determine how large cmsg buffer is needed.
			 */
			opt = sogetoff(mp,
			    tpr->unitdata_ind.OPT_offset,
			    optlen, __TPI_ALIGN_SIZE);

			if (opt == NULL) {
				freemsg(mp);
				error = EPROTO;
				eprintsoline(so, error);
				goto out;
			}
			if (so->so_family == AF_UNIX)
				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
			ncontrollen = so_cmsglen(mp, opt, optlen,
			    !(flags & MSG_XPG4_2));
			if (controllen != 0)
				controllen = ncontrollen;
			else if (ncontrollen != 0)
				msg->msg_flags |= MSG_CTRUNC;
		} else {
			controllen = 0;
		}

		if (namelen != 0) {
			/*
			 * Return address to caller.
			 * Caller handles truncation if length
			 * exceeds msg_namelen.
			 * NOTE: AF_UNIX NUL termination is ensured by
			 * the sender's copyin_name().
			 */
			abuf = kmem_alloc(addrlen, KM_SLEEP);

			bcopy(addr, abuf, addrlen);
			msg->msg_name = abuf;
			msg->msg_namelen = addrlen;
		}

		if (controllen != 0) {
			/*
			 * Return control msg to caller.
			 * Caller handles truncation if length
			 * exceeds msg_controllen.
			 */
			control = kmem_zalloc(controllen, KM_SLEEP);

			error = so_opt2cmsg(mp, opt, optlen,
			    !(flags & MSG_XPG4_2),
			    control, controllen);
			if (error) {
				freemsg(mp);
				if (msg->msg_namelen != 0) {
					kmem_free(msg->msg_name,
					    msg->msg_namelen);
					/*
					 * The caller is documented to free
					 * msg_name; do not leave a pointer
					 * to freed memory behind.
					 */
					msg->msg_name = NULL;
					msg->msg_namelen = 0;
				}
				kmem_free(control, controllen);
				eprintsoline(so, error);
				goto out;
			}
			msg->msg_control = control;
			msg->msg_controllen = controllen;
		}

		freemsg(mp);
		goto out;
	}
	case T_OPTDATA_IND: {
		void *opt;
		t_uscalar_t optlen;

		if ((so->so_state &
		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
		    (uiop->uio_resid != saved_resid) &&
		    !(flags & MSG_PEEK)) {
			sorecv_update_oobstate(so);
		}

		/* Read the message through its own union member */
		optlen = tpr->optdata_ind.OPT_length;

		if (optlen != 0) {
			t_uscalar_t ncontrollen;
			/*
			 * Determine how large cmsg buffer is needed.
			 */
			opt = sogetoff(mp,
			    tpr->optdata_ind.OPT_offset,
			    optlen, __TPI_ALIGN_SIZE);

			if (opt == NULL) {
				freemsg(mp);
				error = EPROTO;
				eprintsoline(so, error);
				goto out;
			}

			ncontrollen = so_cmsglen(mp, opt, optlen,
			    !(flags & MSG_XPG4_2));
			if (controllen != 0)
				controllen = ncontrollen;
			else if (ncontrollen != 0)
				msg->msg_flags |= MSG_CTRUNC;
		} else {
			/*
			 * NOTE(review): this clears the caller's request
			 * for control data, so after a MSG_WAITALL retry a
			 * later message carrying options is reported as
			 * MSG_CTRUNC; confirm that this is intended.
			 */
			controllen = 0;
		}

		if (controllen != 0) {
			/*
			 * Return control msg to caller.
			 * Caller handles truncation if length
			 * exceeds msg_controllen.
			 */
			control = kmem_zalloc(controllen, KM_SLEEP);

			error = so_opt2cmsg(mp, opt, optlen,
			    !(flags & MSG_XPG4_2),
			    control, controllen);
			if (error) {
				freemsg(mp);
				kmem_free(control, controllen);
				eprintsoline(so, error);
				goto out;
			}
			msg->msg_control = control;
			msg->msg_controllen = controllen;
		}

		/*
		 * Set msg_flags to MSG_EOR based on
		 * DATA_flag and MOREDATA.
		 */
		mutex_enter(&so->so_lock);
		so->so_state &= ~SS_SAVEDEOR;
		if (!(tpr->optdata_ind.DATA_flag & 1)) {
			if (!(rval.r_val1 & MOREDATA))
				msg->msg_flags |= MSG_EOR;
			else
				so->so_state |= SS_SAVEDEOR;
		}
		freemsg(mp);
		/*
		 * If some data was received (i.e. not EOF) and the
		 * read/recv* has not been satisfied wait for some more.
		 * Not possible to wait if control info was received.
		 */
		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
		    controllen == 0 &&
		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
			mutex_exit(&so->so_lock);
			pflag = opflag | MSG_NOMARK;
			goto retry;
		}
		goto out_locked;
	}
	case T_EXDATA_IND: {
		dprintso(so, 1,
		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
		    "state %s\n",
		    sti->sti_oobsigcnt, sti->sti_oobcnt,
		    saved_resid - uiop->uio_resid,
		    pr_state(so->so_state, so->so_mode)));
		/*
		 * kstrgetmsg handles MSGMARK so there is nothing to
		 * inspect in the T_EXDATA_IND.
		 * strsock_proto makes the stream head queue the T_EXDATA_IND
		 * as a separate message with no M_DATA component. Furthermore,
		 * the stream head does not consolidate M_DATA messages onto
		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
		 * remains a message by itself. This is needed since MSGMARK
		 * marks both the whole message as well as the last byte
		 * of the message.
		 */
		freemsg(mp);
		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
		if (flags & MSG_PEEK) {
			/*
			 * Even though we are peeking we consume the
			 * T_EXDATA_IND thereby moving the mark information
			 * to SS_RCVATMARK. Then the oob code below will
			 * retry the peeking kstrgetmsg.
			 * Note that the stream head read queue is
			 * never flushed without holding SOREADLOCKED
			 * thus the T_EXDATA_IND can not disappear
			 * underneath us.
			 */
			dprintso(so, 1,
			    ("sotpi_recvmsg: consume EXDATA_IND "
			    "counts %d/%d state %s\n",
			    sti->sti_oobsigcnt,
			    sti->sti_oobcnt,
			    pr_state(so->so_state, so->so_mode)));

			pflag = MSG_ANY | MSG_DELAYERROR;
			if (so->so_mode & SM_ATOMIC)
				pflag |= MSG_DISCARDTAIL;

			pri = 0;
			mp = NULL;

			error = kstrgetmsg(SOTOV(so), &mp, uiop,
			    &pri, &pflag, (clock_t)-1, &rval);
			ASSERT(uiop->uio_resid == saved_resid);

			if (error) {
#ifdef SOCK_DEBUG
				if (error != EWOULDBLOCK && error != EINTR) {
					eprintsoline(so, error);
				}
#endif /* SOCK_DEBUG */
				goto out;
			}
			ASSERT(mp);
			tpr = (union T_primitives *)mp->b_rptr;
			ASSERT(tpr->type == T_EXDATA_IND);
			freemsg(mp);
		} /* end "if (flags & MSG_PEEK)" */

		/*
		 * Decrement the number of queued and pending oob.
		 *
		 * SS_RCVATMARK is cleared when we read past a mark.
		 * SS_HAVEOOBDATA is cleared when we've read past the
		 * last mark.
		 * SS_OOBPEND is cleared if we've read past the last
		 * mark and no (new) SIGURG has been posted.
		 */
		mutex_enter(&so->so_lock);
		ASSERT(so_verify_oobstate(so));
		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
		ASSERT(sti->sti_oobsigcnt > 0);
		sti->sti_oobsigcnt--;
		ASSERT(sti->sti_oobcnt > 0);
		sti->sti_oobcnt--;
		/*
		 * Since the T_EXDATA_IND has been removed from the stream
		 * head, but we have not read data past the mark,
		 * sockfs needs to track that the socket is still at the mark.
		 *
		 * Since no data was received call kstrgetmsg again to wait
		 * for data.
		 */
		so->so_state |= SS_RCVATMARK;
		mutex_exit(&so->so_lock);
		dprintso(so, 1,
		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
		    sti->sti_oobsigcnt, sti->sti_oobcnt,
		    pr_state(so->so_state, so->so_mode)));
		pflag = opflag;
		goto retry;
	}
	default:
		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
		    (void *)so, tpr->type, (void *)mp);
		ASSERT(0);
		freemsg(mp);
		error = EPROTO;
		eprintsoline(so, error);
		goto out;
	}
	/* NOTREACHED */
out:
	mutex_enter(&so->so_lock);
out_locked:
	/* Complete any sodirect receive before releasing the read lock */
	if (so->so_direct != NULL) {
		mutex_enter(so->so_direct->sod_lockp);
		reterr = sod_rcv_done(so, suiop, uiop);
		mutex_exit(so->so_direct->sod_lockp);
	}
	/* An earlier error takes precedence over the sodirect result */
	if (reterr != 0 && error == 0)
		error = reterr;
	so_unlock_read(so);	/* Clear SOREADLOCKED */
	mutex_exit(&so->so_lock);
	return (error);
}
/*
 * Send a datagram that carries ancillary data (control options).
 * Assumes caller has verified that SS_ISBOUND etc. are set.
 *
 * One T_UNITDATA_REQ is built that holds the destination address followed
 * by the options: an SO_FILEP option when file descriptors are passed, an
 * SO_SRCADDR option when an AF_UNIX source address has to be conveyed, and
 * finally the options converted from the caller's control buffer.
 *
 * Returns 0 or an errno value: EMSGSIZE when the data does not fit in one
 * TIDU, EOPNOTSUPP when descriptors are passed on a socket without
 * SM_FDPASSING, EINTR when a signal interrupted the message allocation.
 */
static int
sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
{
	sotpi_info_t *sti = SOTOTPI(so);
	int oldflg = !(flags & MSG_XPG4_2);
	struct T_unitdata_req tudr;
	struct T_opthdr opthdr;
	struct fdbuf *fdbuf;
	mblk_t *mp;
	void *dst;
	socklen_t dstlen;
	void *src;
	socklen_t srclen;
	void *fds;
	int fdlen;
	t_uscalar_t optlen;
	ssize_t datalen;
	int msgsize;
	int error;

	ASSERT(name && namelen);
	ASSERT(control && controllen);

	datalen = uiop->uio_resid;
	if (datalen > (ssize_t)sti->sti_tidu_size)
		return (EMSGSIZE);

	/* Length and family checks. */
	error = so_addr_verify(so, name, namelen);
	if (error) {
		eprintsoline(so, error);
		return (error);
	}

	/*
	 * Unless told otherwise below, the caller's address is handed to the
	 * transport unchanged and no source address option is generated.
	 * That covers every non-AF_UNIX socket as well as AF_UNIX sockets
	 * that already use a transport internal address.
	 */
	dst = name;
	dstlen = namelen;
	src = NULL;
	srclen = 0;

	if (so->so_family == AF_UNIX && !sti->sti_faddr_noxlate) {
		/*
		 * Pass the sockaddr_un source address as an option and
		 * translate the remote address to the internal name.
		 *
		 * Nothing here prevents sti_laddr_sa from changing while it
		 * is in use, so an unbind+bind racing with this send can
		 * make the peer see a mix of the old and the new "from"
		 * address.
		 */
		src = sti->sti_laddr_sa;
		srclen = (t_uscalar_t)sti->sti_laddr_len;
		dprintso(so, 1,
		    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
		    srclen, src));
		error = so_ux_addr_xlate(so, name, namelen,
		    (flags & MSG_XPG4_2),
		    &dst, &dstlen);
		if (error) {
			eprintsoline(so, error);
			return (error);
		}
	}

	/* Lay out the T_UNITDATA_REQ: header, address, then options. */
	optlen = so_optlen(control, controllen, oldflg);
	tudr.PRIM_type = T_UNITDATA_REQ;
	tudr.DEST_length = dstlen;
	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
	tudr.OPT_length = optlen;
	if (srclen != 0) {
		/* Room for the SO_SRCADDR option header and padded value. */
		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (opthdr) +
		    _TPI_ALIGN_TOPT(srclen));
	}
	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
	    _TPI_ALIGN_TOPT(dstlen));
	msgsize = tudr.OPT_offset + tudr.OPT_length;

	/*
	 * File descriptors only when SM_FDPASSING set.
	 * fdlen stays -1 when the control buffer carries no descriptors.
	 */
	error = so_getfdopt(control, controllen, oldflg, &fds, &fdlen);
	if (error)
		return (error);

	if (fdlen == -1) {
		mp = soallocproto(msgsize, _ALLOC_INTR);
		if (mp == NULL) {
			/*
			 * Caught a signal waiting for memory.
			 * Let send* return EINTR.
			 */
			return (EINTR);
		}
	} else {
		if (!(so->so_mode & SM_FDPASSING))
			return (EOPNOTSUPP);

		error = fdbuf_create(fds, fdlen, &fdbuf);
		if (error)
			return (error);
		mp = fdbuf_allocmsg(msgsize, fdbuf);
	}

	/* Header and destination address, padded to option alignment. */
	soappendmsg(mp, &tudr, sizeof (tudr));
	soappendmsg(mp, dst, dstlen);
	mp->b_wptr += _TPI_ALIGN_TOPT(dstlen) - dstlen;

	if (fdlen != -1) {
		/* File descriptors travel as file pointers in SO_FILEP. */
		ASSERT(fdbuf != NULL);
		opthdr.level = SOL_SOCKET;
		opthdr.name = SO_FILEP;
		opthdr.len = fdbuf->fd_size +
		    (t_uscalar_t)sizeof (struct T_opthdr);
		opthdr.status = 0;
		soappendmsg(mp, &opthdr, sizeof (opthdr));
		soappendmsg(mp, fdbuf, fdbuf->fd_size);
		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
	}

	if (srclen != 0) {
		/*
		 * There is a AF_UNIX sockaddr_un to include as a source
		 * address option.
		 */
		opthdr.level = SOL_SOCKET;
		opthdr.name = SO_SRCADDR;
		opthdr.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
		opthdr.status = 0;
		soappendmsg(mp, &opthdr, sizeof (opthdr));
		soappendmsg(mp, src, srclen);
		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
	}
	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);

	/* Convert the caller's control messages into TPI options. */
	so_cmsg2opt(control, controllen, oldflg, mp);

	/* At most 3 bytes left in the message */
	ASSERT(MBLKL(mp) > (ssize_t)(msgsize - __TPI_ALIGN_SIZE));
	ASSERT(MBLKL(mp) <= (ssize_t)msgsize);
	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);

	if (audit_active)
		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);

	error = kstrputmsg(SOTOV(so), mp, uiop, datalen, 0, MSG_BAND, 0);
#ifdef SOCK_DEBUG
	if (error) {
		eprintsoline(so, error);
	}
#endif /* SOCK_DEBUG */
	return (error);
}
/*
 * Send data with ancillary data (control options) on a connected,
 * connection-oriented socket.
 * Assumes caller has verified that SS_ISCONNECTED is set.
 *
 * The data is cut into T_OPTDATA_REQ messages of at most sti_tidu_size
 * bytes each.  The options (including any SO_FILEP option built for passed
 * file descriptors) accompany the first message only; later messages carry
 * an empty option part.
 *
 * "more", when non-zero, makes DATA_flag be set on the last message too.
 *
 * Returns 0 or an errno value: EOPNOTSUPP when the transport lacks
 * SM_OPTDATA (or SM_FDPASSING when descriptors are passed), EMSGSIZE for a
 * transport with a zero TIDU size, EINTR when a signal interrupted the
 * message allocation, EPIPE when the socket can no longer send.
 */
static int
sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
    t_uscalar_t controllen, int flags)
{
	struct T_optdata_req tdr;
	mblk_t *mp;
	int error;
	ssize_t iosize;
	int size;
	struct fdbuf *fdbuf;
	t_uscalar_t optlen;
	void *fds;
	int fdlen;
	struct T_opthdr toh;
	sotpi_info_t *sti = SOTOTPI(so);

	dprintso(so, 1,
	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));

	/*
	 * Has to be bound and connected. However, since no locks are
	 * held the state could have changed after sotpi_sendmsg checked it
	 * thus it is not possible to ASSERT on the state.
	 */

	/* Options on connection-oriented only when SM_OPTDATA set. */
	if (!(so->so_mode & SM_OPTDATA))
		return (EOPNOTSUPP);

	do {
		/*
		 * Set the MORE flag if uio_resid does not fit in this
		 * message or if the caller passed in "more".
		 * Error for transports with zero tidu_size.
		 */
		tdr.PRIM_type = T_OPTDATA_REQ;
		iosize = sti->sti_tidu_size;
		if (iosize <= 0)
			return (EMSGSIZE);
		if (uiop->uio_resid > iosize) {
			tdr.DATA_flag = 1;
		} else {
			if (more)
				tdr.DATA_flag = 1;
			else
				tdr.DATA_flag = 0;
			iosize = uiop->uio_resid;
		}
		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
		    tdr.DATA_flag, iosize));

		/* The option part directly follows the T_optdata_req. */
		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
		tdr.OPT_length = optlen;
		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);

		size = (int)sizeof (tdr) + optlen;

		/*
		 * File descriptors only when SM_FDPASSING set.
		 * fdlen stays -1 when no descriptors are being passed.
		 */
		error = so_getfdopt(control, controllen,
		    !(flags & MSG_XPG4_2), &fds, &fdlen);
		if (error)
			return (error);
		if (fdlen != -1) {
			if (!(so->so_mode & SM_FDPASSING))
				return (EOPNOTSUPP);

			error = fdbuf_create(fds, fdlen, &fdbuf);
			if (error)
				return (error);
			mp = fdbuf_allocmsg(size, fdbuf);
		} else {
			mp = soallocproto(size, _ALLOC_INTR);
			if (mp == NULL) {
				/*
				 * Caught a signal waiting for memory.
				 * Let send* return EINTR.
				 */
				return (EINTR);
			}
		}
		soappendmsg(mp, &tdr, sizeof (tdr));

		if (fdlen != -1) {
			/* Descriptors travel as file pointers in SO_FILEP. */
			ASSERT(fdbuf != NULL);
			toh.level = SOL_SOCKET;
			toh.name = SO_FILEP;
			toh.len = fdbuf->fd_size +
			    (t_uscalar_t)sizeof (struct T_opthdr);
			toh.status = 0;
			soappendmsg(mp, &toh, sizeof (toh));
			soappendmsg(mp, fdbuf, fdbuf->fd_size);
			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
		}
		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
		/* At most 3 bytes left in the message */
		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
		ASSERT(MBLKL(mp) <= (ssize_t)size);

		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);

		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
		    0, MSG_BAND, 0);
		if (error) {
			eprintsoline(so, error);
			return (error);
		}

		/*
		 * The options go out with the first message only.  Clear
		 * the length together with the pointer so that, on the
		 * following iterations, the control parsing helpers are
		 * never handed a NULL buffer with a non-zero length.
		 */
		control = NULL;
		controllen = 0;

		if (uiop->uio_resid > 0) {
			/*
			 * Recheck for fatal errors. Fail write even though
			 * some data have been written. This is consistent
			 * with strwrite semantics and BSD sockets semantics.
			 */
			if (so->so_state & SS_CANTSENDMORE) {
				/* Log the error actually being returned. */
				error = EPIPE;
				eprintsoline(so, error);
				return (error);
			}
			if (so->so_error != 0) {
				mutex_enter(&so->so_lock);
				error = sogeterr(so, B_TRUE);
				mutex_exit(&so->so_lock);
				if (error != 0) {
					eprintsoline(so, error);
					return (error);
				}
			}
		}
	} while (uiop->uio_resid > 0);
	return (0);
}
/*
 * Send a datagram without ancillary data.
 * Assumes caller has verified that SS_ISBOUND etc. are set.
 *
 * Sockets with sti_direct set are handed to sodgram_direct().  Otherwise a
 * T_UNITDATA_REQ is built and sent with kstrputmsg().  For AF_UNIX the
 * destination address is translated to an internal name and the source
 * address is passed along as an SO_SRCADDR option.
 *
 * Returns 0 or an errno value: EMSGSIZE when the data exceeds the TIDU
 * size, EINTR when a signal interrupted the message allocation.
 */
int
sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    struct uio *uiop, int flags)
{
	sotpi_info_t *sti = SOTOTPI(so);
	struct T_unitdata_req tudr;
	struct T_opthdr opthdr;
	mblk_t *mp;
	void *dst;
	socklen_t dstlen;
	void *src;
	socklen_t srclen;
	ssize_t datalen;
	ssize_t msgsize;
	int error;

	ASSERT(name != NULL && namelen != 0);

	datalen = uiop->uio_resid;
	if (datalen > sti->sti_tidu_size) {
		error = EMSGSIZE;
		goto done;
	}

	/* Length and family checks */
	error = so_addr_verify(so, name, namelen);
	if (error != 0)
		goto done;

	if (sti->sti_direct)
		return (sodgram_direct(so, name, namelen, uiop, flags));

	/*
	 * Unless told otherwise below, the caller's address is handed to the
	 * transport unchanged and no source address option is generated.
	 * That covers every non-AF_UNIX socket as well as AF_UNIX sockets
	 * that already use a transport internal address.
	 */
	dst = name;
	dstlen = namelen;
	src = NULL;
	srclen = 0;

	if (so->so_family == AF_UNIX && !sti->sti_faddr_noxlate) {
		/*
		 * Pass the sockaddr_un source address as an option and
		 * translate the remote address to the internal name.
		 *
		 * Nothing here prevents sti_laddr_sa from changing while it
		 * is in use, so an unbind+bind racing with this send can
		 * make the peer see a mix of the old and the new "from"
		 * address.
		 */
		src = sti->sti_laddr_sa;
		srclen = (socklen_t)sti->sti_laddr_len;
		dprintso(so, 1,
		    ("sosend_dgram UNIX: srclen %d, src %p\n",
		    srclen, src));
		error = so_ux_addr_xlate(so, name, namelen,
		    (flags & MSG_XPG4_2),
		    &dst, &dstlen);
		if (error) {
			eprintsoline(so, error);
			goto done;
		}
	}

	tudr.PRIM_type = T_UNITDATA_REQ;
	tudr.DEST_length = dstlen;
	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);

	/* No options, and no extra space requested, by default. */
	tudr.OPT_length = 0;
	tudr.OPT_offset = 0;
	msgsize = 0;

	if (srclen != 0) {
		/*
		 * There is a AF_UNIX sockaddr_un to include as a source
		 * address option.
		 */
		tudr.OPT_length = (t_scalar_t)(sizeof (opthdr) +
		    _TPI_ALIGN_TOPT(srclen));
		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
		    _TPI_ALIGN_TOPT(dstlen));

		opthdr.level = SOL_SOCKET;
		opthdr.name = SO_SRCADDR;
		opthdr.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
		opthdr.status = 0;

		msgsize = tudr.OPT_offset + tudr.OPT_length;
	}

	/* The header and the destination address are copied in here. */
	mp = soallocproto2(&tudr, sizeof (tudr),
	    dst, dstlen, msgsize, _ALLOC_INTR);
	if (mp == NULL) {
		/*
		 * Caught a signal waiting for memory.
		 * Let send* return EINTR.
		 */
		error = EINTR;
		goto done;
	}

	if (srclen != 0) {
		/* Pad the address, then append the SO_SRCADDR option. */
		mp->b_wptr += _TPI_ALIGN_TOPT(dstlen) - dstlen;
		soappendmsg(mp, &opthdr, sizeof (opthdr));
		soappendmsg(mp, src, srclen);
		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
	}

	if (audit_active)
		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);

	error = kstrputmsg(SOTOV(so), mp, uiop, datalen, 0, MSG_BAND, 0);
done:
#ifdef SOCK_DEBUG
	if (error) {
		eprintsoline(so, error);
	}
#endif /* SOCK_DEBUG */
	return (error);
}
/*
 * Sending data on a connected stream socket.
 * Assumes caller has verified that SS_ISCONNECTED is set.
 *
 * The data is cut into messages of at most sti_tidu_size bytes, each one
 * prefixed with a T_data_req header whose PRIM_type is "prim".  "more",
 * when non-zero, makes MORE_flag be set on the last message too.  "sflag"
 * is or-ed into the flags given to kstrputmsg().
 *
 * Returns 0 or an errno value: EMSGSIZE for a transport with a zero TIDU
 * size, EINTR when a signal interrupted the message allocation, EPIPE when
 * the socket can no longer send.
 */
int
sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
    int sflag)
{
	struct T_data_req tdr;
	mblk_t *mp;
	int error;
	ssize_t iosize;
	sotpi_info_t *sti = SOTOTPI(so);

	dprintso(so, 1,
	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
	    (void *)so, uiop->uio_resid, prim, sflag));

	/*
	 * Has to be bound and connected. However, since no locks are
	 * held the state could have changed after sotpi_sendmsg checked it
	 * thus it is not possible to ASSERT on the state.
	 */

	do {
		/*
		 * Set the MORE flag if uio_resid does not fit in this
		 * message or if the caller passed in "more".
		 * Error for transports with zero tidu_size.
		 */
		tdr.PRIM_type = prim;
		iosize = sti->sti_tidu_size;
		if (iosize <= 0)
			return (EMSGSIZE);
		if (uiop->uio_resid > iosize) {
			tdr.MORE_flag = 1;
		} else {
			if (more)
				tdr.MORE_flag = 1;
			else
				tdr.MORE_flag = 0;
			iosize = uiop->uio_resid;
		}
		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
		    prim, tdr.MORE_flag, iosize));
		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR);
		if (mp == NULL) {
			/*
			 * Caught a signal waiting for memory.
			 * Let send* return EINTR.
			 */
			return (EINTR);
		}

		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
		    0, sflag | MSG_BAND, 0);
		if (error) {
			eprintsoline(so, error);
			return (error);
		}
		if (uiop->uio_resid > 0) {
			/*
			 * Recheck for fatal errors. Fail write even though
			 * some data have been written. This is consistent
			 * with strwrite semantics and BSD sockets semantics.
			 */
			if (so->so_state & SS_CANTSENDMORE) {
				/*
				 * "error" is still 0 from the successful
				 * kstrputmsg(); set it so that the trace
				 * shows the error actually being returned.
				 */
				error = EPIPE;
				eprintsoline(so, error);
				return (error);
			}
			if (so->so_error != 0) {
				mutex_enter(&so->so_lock);
				error = sogeterr(so, B_TRUE);
				mutex_exit(&so->so_lock);
				if (error != 0) {
					eprintsoline(so, error);
					return (error);
				}
			}
		}
	} while (uiop->uio_resid > 0);
	return (0);
}
/*
* Check the state for errors and call the appropriate send function.
*
* If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
* this function issues a setsockopt to toggle SO_DONTROUTE before and
* after sending the message.
*/
static int
sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
struct cred *cr)
{
int so_state;
int so_mode;
int error;
struct sockaddr *name;
t_uscalar_t namelen;
int dontroute;
int flags;
sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
(void *)so, (void *)msg, msg->msg_flags,
pr_state(so->so_state, so->so_mode), so->so_error));
if (so->so_version == SOV_STREAM) {
/* The imaginary "sockmod" has been popped - act as a stream */
so_update_attrs(so, SOMOD);
return (strwrite(SOTOV(so), uiop, cr));
}
mutex_enter(&so->so_lock);
so_state = so->so_state;
if (so_state & SS_CANTSENDMORE) {
mutex_exit(&so->so_lock);
return (EPIPE);
}
if (so->so_error != 0) {
error = sogeterr(so, B_TRUE);
if (error != 0) {
mutex_exit(&so->so_lock);
return (error);
}
}
name = (struct sockaddr *)msg->msg_name;
namelen = msg->msg_namelen;
so_mode = so->so_mode;
if (name == NULL) {
if (!(so_state & SS_ISCONNECTED)) {
mutex_exit(&so->so_lock);
if (so_mode & SM_CONNREQUIRED)
return (ENOTCONN);
else
return (EDESTADDRREQ);
}
if (so_mode & SM_CONNREQUIRED) {
name = NULL;
namelen = 0;
} else {
/*
* Note that this code does not prevent sti_faddr_sa
* from changing while it is being used. Thus
* if an "unconnect"+connect occurs concurrently with
* this send the datagram might be delivered to a
* garbaled address.
*/
ASSERT(sti->sti_faddr_sa);
name = sti->sti_faddr_sa;
namelen = (t_uscalar_t)sti->sti_faddr_len;
}
} else {
if (!(so_state & SS_ISCONNECTED) &&
(so_mode & SM_CONNREQUIRED)) {
/* Required but not connected */
mutex_exit(&so->so_lock);
return (ENOTCONN);
}
/*
* Ignore the address on connection-oriented sockets.
* Just like BSD this code does not generate an error for
* TCP (a CONNREQUIRED socket) when sending to an address
* passed in with sendto/sendmsg. Instead the data is
* delivered on the connection as if no address had been
* supplied.
*/
if ((so_state & SS_ISCONNECTED) &&
!(so_mode & SM_CONNREQUIRED)) {
mutex_exit(&so->so_lock);
return (EISCONN);
}
if (!(so_state & SS_ISBOUND)) {
so_lock_single(so); /* Set SOLOCKED */
error = sotpi_bind(so, NULL, 0,
_SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
so_unlock_single(so, SOLOCKED);
if (error) {
mutex_exit(&so->so_lock);
eprintsoline(so, error);
return (error);
}
}
/*
* Handle delayed datagram errors. These are only queued
* when the application sets SO_DGRAM_ERRIND.
* Return the error if we are sending to the address
* that was returned in the last T_UDERROR_IND.
* If sending to some other address discard the delayed
* error indication.
*/
if (sti->sti_delayed_error) {
struct T_uderror_ind *tudi;
void *addr;
t_uscalar_t addrlen;
boolean_t match = B_FALSE;
ASSERT(sti->sti_eaddr_mp);
error = sti->sti_delayed_error;
sti->sti_delayed_error = 0;
tudi =
(struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
addrlen = tudi->DEST_length;
addr = sogetoff(sti->sti_eaddr_mp,
tudi->DEST_offset, addrlen, 1);
ASSERT(addr); /* Checked by strsock_proto */
switch (so->so_family) {
case AF_INET: {
/* Compare just IP address and port */
sin_t *sin1 = (sin_t *)name;
sin_t *sin2 = (sin_t *)addr;
if (addrlen == sizeof (sin_t) &&
namelen == addrlen &&
sin1->sin_port == sin2->sin_port &&
sin1->sin_addr.s_addr ==
sin2->sin_addr.s_addr)
match = B_TRUE;
break;
}
case AF_INET6: {
/* Compare just IP address and port. Not flow */
sin6_t *sin1 = (sin6_t *)name;
sin6_t *sin2 = (sin6_t *)addr;
if (addrlen == sizeof (sin6_t) &&
namelen == addrlen &&
sin1->sin6_port == sin2->sin6_port &&
IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
&sin2->sin6_addr))
match = B_TRUE;
break;
}
case AF_UNIX:
default:
if (namelen == addrlen &&
bcmp(name, addr, namelen) == 0)
match = B_TRUE;
}
if (match) {
freemsg(sti->sti_eaddr_mp);
sti->sti_eaddr_mp = NULL;
mutex_exit(&so->so_lock);
#ifdef DEBUG
dprintso(so, 0,
("sockfs delayed error %d for %s\n",
error,
pr_addr(so->so_family, name, namelen)));
#endif /* DEBUG */
return (error);
}
freemsg(sti->sti_eaddr_mp);
sti->sti_eaddr_mp = NULL;
}
}
mutex_exit(&so->so_lock);
flags = msg->msg_flags;
dontroute = 0;
if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
uint32_t val;
val = 1;
error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
&val, (t_uscalar_t)sizeof (val), cr);
if (error)
return (error);
dontroute = 1;
}
if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
error = EOPNOTSUPP;
goto done;
}
if (msg->msg_controllen != 0) {
if (!(so_mode & SM_CONNREQUIRED)) {
so_update_attrs(so, SOMOD);
error = sosend_dgramcmsg(so, name, namelen, uiop,
msg->msg_control, msg->msg_controllen, flags);
} else {
if (flags & MSG_OOB) {
/* Can't generate T_EXDATA_REQ with options */
error = EOPNOTSUPP;
goto done;
}
so_update_attrs(so, SOMOD);
error = sosend_svccmsg(so, uiop,
!(flags & MSG_EOR),
msg->msg_control, msg->msg_controllen,
flags);
}
goto done;
}
so_update_attrs(so, SOMOD);
if (!(so_mode & SM_CONNREQUIRED)) {
/*
* If there is no SO_DONTROUTE to turn off return immediately
* from send_dgram. This can allow tail-call optimizations.
*/
if (!dontroute) {
return (sosend_dgram(so, name, namelen, uiop, flags));
}
error = sosend_dgram(so, name, namelen, uiop, flags);
} else {
t_scalar_t prim;
int sflag;
/* Ignore msg_name in the connected state */
if (flags & MSG_OOB) {
prim = T_EXDATA_REQ;
/*
* Send down T_EXDATA_REQ even if there is flow
* control for data.
*/
sflag = MSG_IGNFLOW;
} else {
if (so_mode & SM_BYTESTREAM) {
/* Byte stream transport - use write */
dprintso(so, 1, ("sotpi_sendmsg: write\n"));
/* Send M_DATA messages */
if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
(error = nl7c_data(so, uiop)) >= 0) {
/* NL7C consumed the data */
return (error);
}
/*
* If there is no SO_DONTROUTE to turn off,
* sti_direct is on, and there is no flow
* control, we can take the fast path.
*/
if (!dontroute && sti->sti_direct != 0 &&
canputnext(SOTOV(so)->v_stream->sd_wrq)) {
return (sostream_direct(so, uiop,
NULL, cr));
}
error = strwrite(SOTOV(so), uiop, cr);
goto done;
}
prim = T_DATA_REQ;
sflag = 0;
}
/*
* If there is no SO_DONTROUTE to turn off return immediately
* from sosend_svc. This can allow tail-call optimizations.
*/
if (!dontroute)
return (sosend_svc(so, uiop, prim,
!(flags & MSG_EOR), sflag));
error = sosend_svc(so, uiop, prim,
!(flags & MSG_EOR), sflag);
}
ASSERT(dontroute);
done:
if (dontroute) {
uint32_t val;
val = 0;
(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
&val, (t_uscalar_t)sizeof (val), cr);
}
return (error);
}
/*
 * kstrwritemp() behaves much like strwrite().  The main differences are
 * that the caller supplies the mblks and that nothing is copied from user
 * buffers into kernel buffers.
 *
 * Currently, this routine is used by sendfile to send data allocated
 * within the kernel without any copying. This interface does not use the
 * synchronous stream interface, as the synchronous stream interface
 * implies copying.
 *
 * Returns 0 once mp has been passed downstream, otherwise an errno value
 * (ECOMM when the stream's sd_wputdatafunc rejects the message; ENOMEM
 * from the wait is reported as EAGAIN).
 */
int
kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
{
	struct stdata *stp;
	struct queue *wq;
	struct sonode *so;
	mblk_t *filtered;
	boolean_t direct;
	char waitflag = WRITEWAIT;
	int waitmode;
	int finished = 0;
	int error = 0;

	ASSERT(vp->v_stream);
	stp = vp->v_stream;

	so = VTOSO(vp);
	direct = _SOTOTPI(so)->sti_direct;

	/*
	 * This is the sockfs direct fast path. canputnext() need
	 * not be accurate so we don't grab the sd_lock here. If
	 * we get flow-controlled, we grab sd_lock just before the
	 * wait loop below to emulate what strwrite() does.
	 */
	wq = stp->sd_wrq;
	if (canputnext(wq) && direct &&
	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
		return (sostream_direct(so, NULL, mp, CRED()));
	}

	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
		/* Fast check of flags before acquiring the lock */
		mutex_enter(&stp->sd_lock);
		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
		mutex_exit(&stp->sd_lock);

		if (error != 0) {
			if (!(stp->sd_flag & STPLEX) &&
			    (stp->sd_wput_opt & SW_SIGPIPE)) {
				error = EPIPE;
			}
			return (error);
		}
	}

	/* With OLDNDELAY set on the stream FNDELAY is masked for the wait. */
	waitmode = (stp->sd_flag & OLDNDELAY) ? (fmode & ~FNDELAY) : fmode;

	mutex_enter(&stp->sd_lock);
	for (;;) {
		if (canputnext(wq)) {
			mutex_exit(&stp->sd_lock);
			if (stp->sd_wputdatafunc != NULL) {
				filtered = (stp->sd_wputdatafunc)(vp, mp, NULL,
				    NULL, NULL, NULL);
				if (filtered == NULL) {
					/* The caller will free mp */
					return (ECOMM);
				}
				mp = filtered;
			}
			putnext(wq, mp);
			return (0);
		}

		/* Flow controlled: wait, then look at the queue again. */
		error = strwaitq(stp, waitflag, (ssize_t)0, waitmode, -1,
		    &finished);
		if (error != 0 || finished)
			break;
	}
	mutex_exit(&stp->sd_lock);

	/*
	 * EAGAIN tells the application to try again. ENOMEM
	 * is returned only if the memory allocation size
	 * exceeds the physical limits of the system. ENOMEM
	 * can't be true here.
	 */
	return (error == ENOMEM ? EAGAIN : error);
}
/*
 * Send an mblk chain supplied by the caller on a connected AF_INET or
 * AF_INET6 SOCK_STREAM socket by way of kstrwritemp().
 *
 * On success *mpp is set to NULL; on failure *mpp is left untouched.
 * Returns 0 or an errno value (EAFNOSUPPORT, EPIPE, EOPNOTSUPP, ENOTCONN
 * or whatever kstrwritemp() reports).
 */
/* ARGSUSED */
static int
sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
    struct cred *cr, mblk_t **mpp)
{
	int rc;

	/* The checks are ordered; the first one that fails picks the errno. */
	if (!(so->so_family == AF_INET || so->so_family == AF_INET6))
		return (EAFNOSUPPORT);

	if ((so->so_state & SS_CANTSENDMORE) != 0)
		return (EPIPE);

	if (so->so_type != SOCK_STREAM)
		return (EOPNOTSUPP);

	if (!(so->so_state & SS_ISCONNECTED))
		return (ENOTCONN);

	rc = kstrwritemp(so->so_vnode, *mpp, fflag);
	if (rc != 0)
		return (rc);

	*mpp = NULL;
	return (0);
}
/*
 * Direct-call variant of sending data on a datagram socket: when the
 * queue below the stream head accepts data the message is handed straight
 * to udp_wput().
 * Assumes caller has verified that SS_ISBOUND etc. are set, and has done
 * the length and family checks.
 *
 * An unconnected socket gets a T_UNITDATA_REQ in front of the data; a
 * connected one sends the data by itself.  When the fast path cannot be
 * used the data goes through strwrite() (connected) or kstrputmsg()
 * (unconnected).
 *
 * Returns 0 or an errno value.
 */
/* ARGSUSED */
static int
sodgram_direct(struct sonode *so, struct sockaddr *name,
    socklen_t namelen, struct uio *uiop, int flags)
{
	struct stdata *stp = SOTOV(so)->v_stream;
	sotpi_info_t *sti = SOTOTPI(so);
	struct T_unitdata_req tudr;
	mblk_t *mp = NULL;
	mblk_t *mpdata = NULL;
	queue_t *udp_wq;
	boolean_t connected;
	ssize_t datalen;
	int error = 0;

	ASSERT(name != NULL && namelen != 0);
	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
	ASSERT(!(so->so_mode & SM_EXDATA));
	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
	ASSERT(SOTOV(so)->v_type == VSOCK);

	/* Caller checked for proper length */
	datalen = uiop->uio_resid;
	ASSERT(datalen <= sti->sti_tidu_size);

	/* Length and family checks have been done by caller */
	ASSERT(name->sa_family == so->so_family);
	ASSERT(so->so_family == AF_INET ||
	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
	ASSERT(so->so_family == AF_INET6 ||
	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));

	if (stp->sd_sidp != NULL) {
		error = straccess(stp, JCWRITE);
		if (error != 0)
			goto done;
	}

	connected = so->so_state & SS_ISCONNECTED;
	if (!connected) {
		/* Address the datagram explicitly; no options are sent. */
		tudr.PRIM_type = T_UNITDATA_REQ;
		tudr.DEST_length = namelen;
		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
		tudr.OPT_length = 0;
		tudr.OPT_offset = 0;

		mp = soallocproto2(&tudr, sizeof (tudr), name, namelen, 0,
		    _ALLOC_INTR);
		if (mp == NULL) {
			/*
			 * Caught a signal waiting for memory.
			 * Let send* return EINTR.
			 */
			error = EINTR;
			goto done;
		}
	}

	/*
	 * For UDP we don't break up the copyin into smaller pieces
	 * as in the TCP case. That means if ENOMEM is returned by
	 * mcopyinuio() then the uio vector has not been modified at
	 * all and we fallback to either strwrite() or kstrputmsg()
	 * below. Note also that we never generate priority messages
	 * from here.
	 */
	udp_wq = stp->sd_wrq->q_next;
	if (canput(udp_wq)) {
		mpdata = mcopyinuio(stp, uiop, -1, -1, &error);
		if (mpdata != NULL) {
			ASSERT(DB_TYPE(mpdata) == M_DATA);
			ASSERT(uiop->uio_resid == 0);
			if (connected)
				mp = mpdata;
			else
				linkb(mp, mpdata);
			if (audit_active) {
				audit_sock(T_UNITDATA_REQ,
				    strvp2wq(SOTOV(so)), mp, 0);
			}

			udp_wput(udp_wq, mp);
			return (0);
		}
	}

	ASSERT(mpdata == NULL);
	if (error != 0 && error != ENOMEM) {
		/* Hard copyin failure; mp is NULL for a connected socket. */
		freemsg(mp);
		return (error);
	}

	/*
	 * For connected, let strwrite() handle the blocking case.
	 * Otherwise we fall thru and use kstrputmsg().
	 */
	if (connected)
		return (strwrite(SOTOV(so), uiop, CRED()));

	if (audit_active)
		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);

	error = kstrputmsg(SOTOV(so), mp, uiop, datalen, 0, MSG_BAND, 0);
done:
#ifdef SOCK_DEBUG
	if (error != 0) {
		eprintsoline(so, error);
	}
#endif /* SOCK_DEBUG */
	return (error);
}
/*
 * Direct-call write path for byte-stream sockets: data is handed to
 * tcp_wput() on the queue below the stream head write queue instead of
 * going through strwrite().
 *
 * Exactly one of uiop and mp describes the data:
 *	uiop == NULL	mp is a ready-made message (the kstrwritemp() case)
 *			and is passed down as is.
 *	uiop != NULL	the user data is copied in with mcopyinuio(), at
 *			most sd_qn_maxpsz bytes per message; the incoming
 *			mp argument is not used.
 *
 * Whenever the fast path cannot be used (error/hangup/plex/delim/ndelay
 * flags on the stream, ENOMEM from mcopyinuio(), flow control) the rest of
 * the data is handed to strwrite()/strwrite_common().
 *
 * Returns 0, the error from straccess() or mcopyinuio(), ECOMM when the
 * stream's sd_wputdatafunc returns NULL, or the result of the strwrite
 * fallback.
 */
int
sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
{
	struct stdata *stp = SOTOV(so)->v_stream;
	ssize_t iosize, rmax, maxblk;
	queue_t *tcp_wq = stp->sd_wrq->q_next;
	mblk_t *newmp;
	int error = 0, wflag = 0;

	ASSERT(so->so_mode & SM_BYTESTREAM);
	ASSERT(SOTOV(so)->v_type == VSOCK);

	/* straccess() is consulted only when sd_sidp is set on the stream. */
	if (stp->sd_sidp != NULL &&
	    (error = straccess(stp, JCWRITE)) != 0)
		return (error);

	if (uiop == NULL) {
		/*
		 * kstrwritemp() should have checked sd_flag and
		 * flow-control before coming here.  If we end up
		 * here it means that we can simply pass down the
		 * data to tcp.
		 */
		ASSERT(mp != NULL);
		if (stp->sd_wputdatafunc != NULL) {
			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
			    NULL, NULL, NULL);
			if (newmp == NULL) {
				/* The caller will free mp */
				return (ECOMM);
			}
			mp = newmp;
		}
		tcp_wput(tcp_wq, mp);
		return (0);
	}

	/* Fallback to strwrite() to do proper error handling */
	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
		return (strwrite(SOTOV(so), uiop, cr));

	/* rmax bounds the amount of data put into a single message. */
	rmax = stp->sd_qn_maxpsz;
	ASSERT(rmax >= 0 || rmax == INFPSZ);
	/* Nothing is sent when the limit is 0 or there is no data. */
	if (rmax == 0 || uiop->uio_resid <= 0)
		return (0);

	/* No limit: everything may go into one message. */
	if (rmax == INFPSZ)
		rmax = uiop->uio_resid;

	maxblk = stp->sd_maxblk;

	for (;;) {
		iosize = MIN(uiop->uio_resid, rmax);

		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
		if (mp == NULL) {
			/*
			 * Fallback to strwrite() for ENOMEM; if this
			 * is our first time in this routine and the uio
			 * vector has not been modified, we will end up
			 * calling strwrite() without any flag set.
			 */
			if (error == ENOMEM)
				goto slow_send;
			else
				return (error);
		}
		ASSERT(uiop->uio_resid >= 0);
		/*
		 * If mp is non-NULL and ENOMEM is set, it means that
		 * mcopyinuio() was able to break down some of the user
		 * data into one or more mblks.  Send the partial data
		 * to tcp and let the rest be handled in strwrite().
		 */
		ASSERT(error == 0 || error == ENOMEM);
		if (stp->sd_wputdatafunc != NULL) {
			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
			    NULL, NULL, NULL);
			if (newmp == NULL) {
				/* The caller will free mp */
				/*
				 * NOTE(review): on this path mp came from
				 * mcopyinuio() above, not from our caller,
				 * and it is not returned to the caller.
				 * Confirm who frees it when sd_wputdatafunc
				 * returns NULL.
				 */
				return (ECOMM);
			}
			mp = newmp;
		}
		tcp_wput(tcp_wq, mp);

		/* Some data has gone down; remember that for the fallback. */
		wflag |= NOINTR;

		if (uiop->uio_resid == 0) {	/* No more data; we're done */
			ASSERT(error == 0);
			break;
		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
			/*
			 * slow_send is also the target of the goto taken
			 * above when mcopyinuio() returned NULL with ENOMEM.
			 */
slow_send:
			/*
			 * We were able to send down partial data using
			 * the direct call interface, but are now relying
			 * on strwrite() to handle the non-fastpath cases.
			 * If the socket is blocking we will sleep in
			 * strwaitq() until write is permitted, otherwise,
			 * we will need to return the amount of bytes
			 * written so far back to the app.  This is the
			 * reason why we pass NOINTR flag to strwrite()
			 * for non-blocking socket, because we don't want
			 * to return EAGAIN when portion of the user data
			 * has actually been sent down.
			 */
			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
		}
	}
	return (0);
}
/*
 * Return the peer address of a socket, refreshing sti_faddr from the
 * transport when necessary (never for AF_UNIX).
 *
 * With "accept" set the recorded peer address is returned without any
 * state checks.  Otherwise the socket has to be connected, and the
 * recorded address is used whenever sti_faddr_valid is set; only when it
 * is not is the transport asked through TI_GETPEERNAME.
 *
 * At most *namelen bytes are copied to "name"; *namelen is then set to
 * the length of the address (0 for an untranslated AF_UNIX peer).
 *
 * Returns 0, ENOTCONN, or EINVAL (X/Open check after a shutdown of the
 * send side).  A failing TI_GETPEERNAME ioctl is not reported.
 */
/* ARGSUSED */
int
sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
    boolean_t accept, struct cred *cr)
{
	sotpi_info_t *sti = SOTOTPI(so);
	struct strbuf strbuf;
	k_sigset_t smask;
	void *tmpaddr;
	t_uscalar_t tmplen;
	int rval;
	int error = 0;

	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
	    (void *)so, pr_state(so->so_state, so->so_mode)));

	ASSERT(*namelen > 0);
	mutex_enter(&so->so_lock);
	so_lock_single(so);	/* Set SOLOCKED */

	if (accept)
		goto cached;

	if (!(so->so_state & SS_ISCONNECTED)) {
		error = ENOTCONN;
		goto done;
	}

	/* Added this check for X/Open */
	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
		error = EINVAL;
		if (xnet_check_print) {
			printf("sockfs: X/Open getpeername check => EINVAL\n");
		}
		goto done;
	}

	if (sti->sti_faddr_valid)
		goto cached;

#ifdef DEBUG
	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
	    pr_addr(so->so_family, sti->sti_faddr_sa,
	    (t_uscalar_t)sti->sti_faddr_len)));
#endif /* DEBUG */

	if (so->so_family == AF_UNIX) {
		/* Transport has different name space - return local info */
		if (sti->sti_faddr_noxlate)
			*namelen = 0;
		error = 0;
		goto done;
	}

	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
	ASSERT(sti->sti_faddr_sa);

	/*
	 * Allocate local buffer to use with ioctl.  so_lock is dropped
	 * around the allocation and the ioctl; SOLOCKED stays set.
	 */
	tmplen = (t_uscalar_t)sti->sti_faddr_maxlen;
	mutex_exit(&so->so_lock);
	tmpaddr = kmem_alloc(tmplen, KM_SLEEP);

	/*
	 * Issue TI_GETPEERNAME with signals masked.
	 * Put the result in sti_faddr_sa so that getpeername works after
	 * a shutdown(output).
	 */
	strbuf.buf = tmpaddr;
	strbuf.maxlen = tmplen;
	strbuf.len = 0;

	sigintr(&smask, 0);
	rval = 0;
	ASSERT(cr);
	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
	    0, K_TO_K, cr, &rval);
	sigunintr(&smask);

	mutex_enter(&so->so_lock);
	if (error == 0) {
		/*
		 * Use the answer only when the transport returned an
		 * address and the socket is still connected.
		 */
		if (rval == 0 && strbuf.len > 0 &&
		    (so->so_state & SS_ISCONNECTED)) {
			ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
			sti->sti_faddr_len = (socklen_t)strbuf.len;
			bcopy(tmpaddr, sti->sti_faddr_sa, sti->sti_faddr_len);
			sti->sti_faddr_valid = 1;

			bcopy(tmpaddr, name,
			    MIN(*namelen, sti->sti_faddr_len));
			*namelen = sti->sti_faddr_len;
		}
	} else {
		/*
		 * Do not fail the getpeername because the ioctl failed.
		 *
		 * Various stream head errors can be returned to the ioctl.
		 * However, it is impossible to determine which ones of
		 * these are really socket level errors that were incorrectly
		 * consumed by the ioctl.  Thus this code silently ignores
		 * the error - the code explicitly does not reinstate the
		 * error using soseterror().
		 * Experiments have shown that at least this set of
		 * errors are reported and should not be reinstated on the
		 * socket:
		 *	EINVAL	E.g. if an I_LINK was in effect when
		 *		getpeername was called.
		 *	EPIPE	The ioctl error semantics prefer the write
		 *		side error over the read side error.
		 *	ENOTCONN The transport just got disconnected but
		 *		sockfs had not yet seen the T_DISCON_IND
		 *		when issuing the ioctl.
		 */
		error = 0;
	}
	kmem_free(tmpaddr, tmplen);
#ifdef DEBUG
	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
	    pr_addr(so->so_family, sti->sti_faddr_sa,
	    (t_uscalar_t)sti->sti_faddr_len)));
#endif /* DEBUG */
	goto done;

cached:
	/* Hand out the peer address recorded in the sonode. */
	bcopy(sti->sti_faddr_sa, name,
	    MIN(*namelen, sti->sti_faddr_len));
	if (sti->sti_faddr_noxlate)
		*namelen = 0;
	else
		*namelen = sti->sti_faddr_len;
done:
	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
	return (error);
}
/*
 * Return the local address of the socket in name.
 *
 * If a local address is already cached (sti_laddr_valid) it is returned
 * as is.  Otherwise, for a bound socket that is not AF_UNIX, the transport
 * is asked with TI_GETMYNAME and a successful answer is cached in
 * sti_laddr_sa.  At most *namelen bytes are copied to name and *namelen is
 * set to the full address length whenever an address is returned.
 *
 * Errors from the ioctl are dropped on purpose, so this routine always
 * returns zero.
 */
int
sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
    struct cred *cr)
{
	sotpi_info_t	*sti = SOTOTPI(so);
	struct strbuf	reply;
	k_sigset_t	saved_sigs;
	t_uscalar_t	buflen;
	void		*buf;
	int		rval;
	int		error = 0;

	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
	    (void *)so, pr_state(so->so_state, so->so_mode)));

	ASSERT(*namelen > 0);
	mutex_enter(&so->so_lock);
	so_lock_single(so);	/* Set SOLOCKED */

#ifdef DEBUG
	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
	    pr_addr(so->so_family, sti->sti_laddr_sa,
	    (t_uscalar_t)sti->sti_laddr_len)));
#endif /* DEBUG */

	if (sti->sti_laddr_valid) {
		/* The cached address is good; hand it out directly. */
		bcopy(sti->sti_laddr_sa, name,
		    MIN(*namelen, sti->sti_laddr_len));
		*namelen = sti->sti_laddr_len;
		goto done;
	}

	/*
	 * Nothing more can be returned when either:
	 *  - the socket is AF_UNIX (the transport has a different name
	 *    space, so only local info could be returned), or
	 *  - the socket is not bound.
	 */
	if (so->so_family == AF_UNIX || !(so->so_state & SS_ISBOUND))
		goto done;

	/* Allocate a local buffer for the ioctl; so_lock is dropped first. */
	buflen = (t_uscalar_t)sti->sti_laddr_maxlen;
	mutex_exit(&so->so_lock);
	buf = kmem_alloc(buflen, KM_SLEEP);

	/*
	 * Issue TI_GETMYNAME with signals masked.
	 * The result is stored in sti_laddr_sa so that getsockname works
	 * after a shutdown(output).
	 */
	reply.len = 0;
	reply.maxlen = buflen;
	reply.buf = buf;
	rval = 0;

	ASSERT(cr);
	sigintr(&saved_sigs, 0);
	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&reply,
	    0, K_TO_K, cr, &rval);
	sigunintr(&saved_sigs);

	mutex_enter(&so->so_lock);

	/*
	 * Only a clean, non-empty answer for a socket that is still bound
	 * is recorded and returned.
	 */
	if (error == 0 && rval == 0 && reply.len > 0 &&
	    (so->so_state & SS_ISBOUND)) {
		ASSERT(reply.len <= (int)sti->sti_laddr_maxlen);
		sti->sti_laddr_len = (socklen_t)reply.len;
		bcopy(buf, sti->sti_laddr_sa, sti->sti_laddr_len);
		sti->sti_laddr_valid = 1;

		bcopy(buf, name, MIN(sti->sti_laddr_len, *namelen));
		*namelen = sti->sti_laddr_len;
	}

	/*
	 * Various stream head errors can be returned to the ioctl.
	 * It is impossible to tell which of them are really socket level
	 * errors that were incorrectly consumed by the ioctl, so the error
	 * is silently ignored and explicitly not reinstated with
	 * soseterror().  Experiments have shown that at least these errors
	 * are reported and should not be reinstated on the socket:
	 *	EINVAL	E.g. if an I_LINK was in effect when
	 *		getsockname was called.
	 *	EPIPE	The ioctl error semantics prefer the write
	 *		side error over the read side error.
	 */
	error = 0;

	kmem_free(buf, buflen);
#ifdef DEBUG
	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
	    pr_addr(so->so_family, sti->sti_laddr_sa,
	    (t_uscalar_t)sti->sti_laddr_len)));
#endif /* DEBUG */
done:
	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
	return (error);
}
/*
 * Get socket options. For SOL_SOCKET options some options are handled
 * by the sockfs while others use the value recorded in the sonode as a
 * fallback should the T_SVR4_OPTMGMT_REQ fail.
 *
 * On return at most *optlenp bytes are copied to optval and *optlenp is
 * set to the number of bytes copied.
 *
 * Returns 0 on success, EINVAL if the caller's buffer is too small for a
 * known SOL_SOCKET option, EPROTO if the transport's ack is malformed and
 * no fallback value exists, or the error from kstrputmsg()/sowaitprim()
 * when no fallback value exists.
 */
/* ARGSUSED */
int
sotpi_getsockopt(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags, struct cred *cr)
{
	struct T_optmgmt_req	optmgmt_req;
	struct T_optmgmt_ack	*optmgmt_ack;
	struct opthdr		oh;
	struct opthdr		*opt_res;
	mblk_t			*mp = NULL;
	int			error = 0;
	void			*option = NULL;	/* Set if fallback value */
	void			*ack_option;	/* Value carried in the ack */
	t_uscalar_t		maxlen = *optlenp;
	t_uscalar_t		len;
	uint32_t		value;
	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */

	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
	    (void *)so, level, option_name, optval, (void *)optlenp,
	    pr_state(so->so_state, so->so_mode)));

	mutex_enter(&so->so_lock);
	so_lock_single(so);	/* Set SOLOCKED */

	/*
	 * Check for SOL_SOCKET options.
	 * Certain SOL_SOCKET options are returned directly whereas
	 * others only provide a default (fallback) value should
	 * the T_SVR4_OPTMGMT_REQ fail.
	 */
	if (level == SOL_SOCKET) {
		/* Check parameters */
		switch (option_name) {
		case SO_TYPE:
		case SO_ERROR:
		case SO_DEBUG:
		case SO_ACCEPTCONN:
		case SO_REUSEADDR:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_BROADCAST:
		case SO_USELOOPBACK:
		case SO_OOBINLINE:
		case SO_SNDBUF:
		case SO_RCVBUF:
#ifdef notyet
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
#endif /* notyet */
		case SO_DOMAIN:
		case SO_DGRAM_ERRIND:
			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
				error = EINVAL;
				eprintsoline(so, error);
				goto done2;
			}
			break;
		case SO_RCVTIMEO:
		case SO_SNDTIMEO:
			if (maxlen < (t_uscalar_t)sizeof (struct timeval)) {
				error = EINVAL;
				eprintsoline(so, error);
				goto done2;
			}
			break;
		case SO_LINGER:
			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
				error = EINVAL;
				eprintsoline(so, error);
				goto done2;
			}
			break;
		case SO_SND_BUFINFO:
			if (maxlen < (t_uscalar_t)
			    sizeof (struct so_snd_bufinfo)) {
				error = EINVAL;
				eprintsoline(so, error);
				goto done2;
			}
			break;
		}

		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */

		switch (option_name) {
		case SO_TYPE:
			value = so->so_type;
			option = &value;
			goto copyout;	/* No need to issue T_SVR4_OPTMGMT_REQ */

		case SO_ERROR:
			value = sogeterr(so, B_TRUE);
			option = &value;
			goto copyout;	/* No need to issue T_SVR4_OPTMGMT_REQ */

		case SO_ACCEPTCONN:
			if (so->so_state & SS_ACCEPTCONN)
				value = SO_ACCEPTCONN;
			else
				value = 0;
#ifdef DEBUG
			if (value) {
				dprintso(so, 1,
				    ("sotpi_getsockopt: 0x%x is set\n",
				    option_name));
			} else {
				dprintso(so, 1,
				    ("sotpi_getsockopt: 0x%x not set\n",
				    option_name));
			}
#endif /* DEBUG */
			option = &value;
			goto copyout;	/* No need to issue T_SVR4_OPTMGMT_REQ */

		case SO_DEBUG:
		case SO_REUSEADDR:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_BROADCAST:
		case SO_USELOOPBACK:
		case SO_OOBINLINE:
		case SO_DGRAM_ERRIND:
			value = (so->so_options & option_name);
#ifdef DEBUG
			if (value) {
				dprintso(so, 1,
				    ("sotpi_getsockopt: 0x%x is set\n",
				    option_name));
			} else {
				dprintso(so, 1,
				    ("sotpi_getsockopt: 0x%x not set\n",
				    option_name));
			}
#endif /* DEBUG */
			option = &value;
			goto copyout;	/* No need to issue T_SVR4_OPTMGMT_REQ */

		/*
		 * The following options are only returned by sockfs when the
		 * T_SVR4_OPTMGMT_REQ fails.
		 */
		case SO_LINGER:
			option = &so->so_linger;
			len = (t_uscalar_t)sizeof (struct linger);
			break;
		case SO_SNDBUF: {
			ssize_t lvalue;

			/*
			 * If the option has not been set then get a default
			 * value from the queue below the stream head's
			 * write queue. This value is returned if the
			 * transport fails the T_SVR4_OPTMGMT_REQ.
			 */
			lvalue = so->so_sndbuf;
			if (lvalue == 0) {
				mutex_exit(&so->so_lock);
				(void) strqget(strvp2wq(SOTOV(so))->q_next,
				    QHIWAT, 0, &lvalue);
				mutex_enter(&so->so_lock);
				dprintso(so, 1,
				    ("got SO_SNDBUF %ld from q\n", lvalue));
			}
			value = (int)lvalue;
			option = &value;
			len = (t_uscalar_t)sizeof (so->so_sndbuf);
			break;
		}
		case SO_RCVBUF: {
			ssize_t lvalue;

			/*
			 * If the option has not been set then get a default
			 * value from the read queue. This value is
			 * returned if the transport fails
			 * the T_SVR4_OPTMGMT_REQ.
			 *
			 * XXX If SO_RCVBUF has been set and this is an
			 * XPG 4.2 application then do not ask the transport
			 * since the transport might adjust the value and not
			 * return exactly what was set by the application.
			 * For non-XPG 4.2 application we return the value
			 * that the transport is actually using.
			 */
			lvalue = so->so_rcvbuf;
			if (lvalue == 0) {
				mutex_exit(&so->so_lock);
				(void) strqget(RD(strvp2wq(SOTOV(so))),
				    QHIWAT, 0, &lvalue);
				mutex_enter(&so->so_lock);
				dprintso(so, 1,
				    ("got SO_RCVBUF %ld from q\n", lvalue));
			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
				value = (int)lvalue;
				option = &value;
				goto copyout;	/* skip asking transport */
			}
			value = (int)lvalue;
			option = &value;
			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
			break;
		}
		case SO_DOMAIN:
			value = so->so_family;
			option = &value;
			goto copyout;	/* No need to issue T_SVR4_OPTMGMT_REQ */

#ifdef notyet
		/*
		 * We do not implement the semantics of these options
		 * thus we shouldn't implement the options either.
		 */
		case SO_SNDLOWAT:
			value = so->so_sndlowat;
			option = &value;
			break;
		case SO_RCVLOWAT:
			value = so->so_rcvlowat;
			option = &value;
			break;
#endif /* notyet */
		case SO_SNDTIMEO:
		case SO_RCVTIMEO: {
			clock_t val;

			if (option_name == SO_RCVTIMEO)
				val = drv_hztousec(so->so_rcvtimeo);
			else
				val = drv_hztousec(so->so_sndtimeo);
			tmo_val.tv_sec = val / (1000 * 1000);
			tmo_val.tv_usec = val % (1000 * 1000);
			option = &tmo_val;
			len = (t_uscalar_t)sizeof (struct timeval);
			break;
		}
		case SO_SND_BUFINFO: {
			snd_bufinfo.sbi_wroff =
			    (so->so_proto_props).sopp_wroff;
			snd_bufinfo.sbi_maxblk =
			    (so->so_proto_props).sopp_maxblk;
			snd_bufinfo.sbi_maxpsz =
			    (so->so_proto_props).sopp_maxpsz;
			snd_bufinfo.sbi_tail =
			    (so->so_proto_props).sopp_tail;
			option = &snd_bufinfo;
			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
			break;
		}
		}
	}

	mutex_exit(&so->so_lock);

	/* Send request */
	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
	optmgmt_req.MGMT_flags = T_CHECK;
	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);

	oh.level = level;
	oh.name = option_name;
	oh.len = maxlen;

	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP);
	/* Let option management work in the presence of data flow control */
	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
	/* The request is no longer referenced here; mp is reused for the ack */
	mp = NULL;
	mutex_enter(&so->so_lock);
	if (error) {
		eprintsoline(so, error);
		goto done2;
	}
	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
	if (error) {
		if (option != NULL) {
			/* We have a fallback value */
			error = 0;
			goto copyout;
		}
		eprintsoline(so, error);
		goto done2;
	}
	ASSERT(mp);
	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
	if (opt_res == NULL) {
		if (option != NULL) {
			/* We have a fallback value */
			error = 0;
			goto copyout;
		}
		error = EPROTO;
		eprintsoline(so, error);
		goto done;
	}

	/*
	 * The option value follows the opthdr in the ack.  Validate it
	 * through a temporary so that "option" keeps pointing at the
	 * sockfs fallback value (or stays NULL) until the ack is known to
	 * be good.  Overwriting "option" first would make the fallback
	 * test below always true and copy out an out-of-bounds pointer.
	 */
	ack_option = &opt_res[1];

	/* check to ensure that the option is within bounds */
	if (((uintptr_t)ack_option + opt_res->len < (uintptr_t)ack_option) ||
	    (uintptr_t)ack_option + opt_res->len > (uintptr_t)mp->b_wptr) {
		if (option != NULL) {
			/* We have a fallback value */
			error = 0;
			goto copyout;
		}
		error = EPROTO;
		eprintsoline(so, error);
		goto done;
	}

	option = ack_option;
	len = opt_res->len;

copyout: {
		/* Never copy more than the caller's buffer can hold */
		t_uscalar_t size = MIN(len, maxlen);
		bcopy(option, optval, size);
		bcopy(&size, optlenp, sizeof (size));
	}
done:
	freemsg(mp);
done2:
	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
	return (error);
}
/*
 * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
 * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
 * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
 * setsockopt has to work even if the transport does not support the option.
 *
 * For AF_INET/AF_INET6 SOV_SOCKSTREAM sockets with a protocol handle, a set
 * of SOL_SOCKET options and TCP_NODELAY are applied directly to the tcp
 * structure and the sonode without sending a T_SVR4_OPTMGMT_REQ.
 *
 * Returns 0 on success, EINVAL for a bad option length (or when the X/Open
 * check fails), ENOPROTOOPT for SOL_SOCKET options that can't be set, or
 * the error from the transport for options sockfs does not know about.
 */
/* ARGSUSED */
int
sotpi_setsockopt(struct sonode *so, int level, int option_name,
    const void *optval, t_uscalar_t optlen, struct cred *cr)
{
	struct T_optmgmt_req	optmgmt_req;
	struct opthdr		oh;
	mblk_t			*mp;
	int			error = 0;
	boolean_t		handled = B_FALSE;

	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
	    (void *)so, level, option_name, optval, optlen,
	    pr_state(so->so_state, so->so_mode)));

	/* X/Open requires this check */
	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
		if (xnet_check_print)
			printf("sockfs: X/Open setsockopt check => EINVAL\n");
		return (EINVAL);
	}

	mutex_enter(&so->so_lock);
	so_lock_single(so);	/* Set SOLOCKED */
	mutex_exit(&so->so_lock);

	/*
	 * For SOCKET or TCP level options, try to set it here itself
	 * provided socket has not been popped and we know the tcp
	 * structure (stored in so_priv).
	 */
	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
	    (so->so_version == SOV_SOCKSTREAM) &&
	    (so->so_proto_handle != NULL)) {
		tcp_t		*tcp = (tcp_t *)so->so_proto_handle;
		boolean_t	onoff;

#define	intvalue	(*(int32_t *)optval)

		switch (level) {
		case SOL_SOCKET:
			switch (option_name) {		/* Check length param */
			case SO_DEBUG:
			case SO_REUSEADDR:
			case SO_DONTROUTE:
			case SO_BROADCAST:
			case SO_USELOOPBACK:
			case SO_OOBINLINE:
			case SO_DGRAM_ERRIND:
				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
					error = EINVAL;
					eprintsoline(so, error);
					mutex_enter(&so->so_lock);
					goto done2;
				}
				ASSERT(optval);
				onoff = intvalue != 0;
				handled = B_TRUE;
				break;
			case SO_SNDTIMEO:
			case SO_RCVTIMEO:
				if (optlen !=
				    (t_uscalar_t)sizeof (struct timeval)) {
					error = EINVAL;
					eprintsoline(so, error);
					mutex_enter(&so->so_lock);
					goto done2;
				}
				ASSERT(optval);
				handled = B_TRUE;
				break;
			case SO_LINGER:
				if (optlen !=
				    (t_uscalar_t)sizeof (struct linger)) {
					error = EINVAL;
					eprintsoline(so, error);
					mutex_enter(&so->so_lock);
					goto done2;
				}
				ASSERT(optval);
				handled = B_TRUE;
				break;
			}

			switch (option_name) {			/* Do actions */
			case SO_LINGER: {
				struct linger *lgr = (struct linger *)optval;

				if (lgr->l_onoff) {
					tcp->tcp_linger = 1;
					tcp->tcp_lingertime = lgr->l_linger;
					so->so_linger.l_onoff = SO_LINGER;
					so->so_options |= SO_LINGER;
				} else {
					tcp->tcp_linger = 0;
					tcp->tcp_lingertime = 0;
					so->so_linger.l_onoff = 0;
					so->so_options &= ~SO_LINGER;
				}
				so->so_linger.l_linger = lgr->l_linger;
				handled = B_TRUE;
				break;
			}
			case SO_SNDTIMEO:
			case SO_RCVTIMEO: {
				/*
				 * These are marked handled above, which
				 * skips both the transport request and the
				 * recording code at "done".  Record the
				 * timeout here so that it is not silently
				 * dropped; the conversion is the same as the
				 * one used at "done".
				 */
				struct timeval *tl = (struct timeval *)optval;
				clock_t val = tl->tv_sec * 1000 * 1000 +
				    tl->tv_usec;

				if (option_name == SO_RCVTIMEO)
					so->so_rcvtimeo = drv_usectohz(val);
				else
					so->so_sndtimeo = drv_usectohz(val);
				break;
			}
			case SO_DEBUG:
				tcp->tcp_debug = onoff;
#ifdef SOCK_TEST
				if (intvalue & 2)
					sock_test_timelimit = 10 * hz;
				else
					sock_test_timelimit = 0;

				if (intvalue & 4)
					do_useracc = 0;
				else
					do_useracc = 1;
#endif /* SOCK_TEST */
				break;
			case SO_DONTROUTE:
				/*
				 * SO_DONTROUTE, SO_USELOOPBACK and
				 * SO_BROADCAST are only of interest to IP.
				 * We track them here only so
				 * that we can report their current value.
				 */
				tcp->tcp_dontroute = onoff;
				if (onoff)
					so->so_options |= option_name;
				else
					so->so_options &= ~option_name;
				break;
			case SO_USELOOPBACK:
				tcp->tcp_useloopback = onoff;
				if (onoff)
					so->so_options |= option_name;
				else
					so->so_options &= ~option_name;
				break;
			case SO_BROADCAST:
				tcp->tcp_broadcast = onoff;
				if (onoff)
					so->so_options |= option_name;
				else
					so->so_options &= ~option_name;
				break;
			case SO_REUSEADDR:
				tcp->tcp_reuseaddr = onoff;
				if (onoff)
					so->so_options |= option_name;
				else
					so->so_options &= ~option_name;
				break;
			case SO_OOBINLINE:
				tcp->tcp_oobinline = onoff;
				if (onoff)
					so->so_options |= option_name;
				else
					so->so_options &= ~option_name;
				break;
			case SO_DGRAM_ERRIND:
				tcp->tcp_dgram_errind = onoff;
				if (onoff)
					so->so_options |= option_name;
				else
					so->so_options &= ~option_name;
				break;
			}
			break;
		case IPPROTO_TCP:
			switch (option_name) {
			case TCP_NODELAY:
				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
					error = EINVAL;
					eprintsoline(so, error);
					mutex_enter(&so->so_lock);
					goto done2;
				}
				ASSERT(optval);
				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
				handled = B_TRUE;
				break;
			}
			break;
		default:
			handled = B_FALSE;
			break;
		}
	}
#undef	intvalue

	if (handled) {
		/* Fully dealt with above; nothing to send to the transport */
		mutex_enter(&so->so_lock);
		goto done2;
	}

	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
	optmgmt_req.MGMT_flags = T_NEGOTIATE;
	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);

	oh.level = level;
	oh.name = option_name;
	oh.len = optlen;

	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP);
	/* Let option management work in the presence of data flow control */
	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
	mp = NULL;
	mutex_enter(&so->so_lock);
	if (error) {
		eprintsoline(so, error);
		goto done2;
	}
	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
	if (error) {
		eprintsoline(so, error);
		goto done;
	}
	ASSERT(mp);
	/* No need to verify T_optmgmt_ack */
	freemsg(mp);
done:
	/*
	 * Check for SOL_SOCKET options and record their values.
	 * If we know about a SOL_SOCKET parameter and the transport
	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
	 * EPROTO) we let the setsockopt succeed.
	 */
	if (level == SOL_SOCKET) {
		/* Check parameters */
		switch (option_name) {
		case SO_DEBUG:
		case SO_REUSEADDR:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_BROADCAST:
		case SO_USELOOPBACK:
		case SO_OOBINLINE:
		case SO_SNDBUF:
		case SO_RCVBUF:
#ifdef notyet
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
#endif /* notyet */
		case SO_DGRAM_ERRIND:
			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
				error = EINVAL;
				eprintsoline(so, error);
				goto done2;
			}
			ASSERT(optval);
			handled = B_TRUE;
			break;
		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			if (optlen != (t_uscalar_t)sizeof (struct timeval)) {
				error = EINVAL;
				eprintsoline(so, error);
				goto done2;
			}
			ASSERT(optval);
			handled = B_TRUE;
			break;
		case SO_LINGER:
			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
				error = EINVAL;
				eprintsoline(so, error);
				goto done2;
			}
			ASSERT(optval);
			handled = B_TRUE;
			break;
		}

#define	intvalue	(*(int32_t *)optval)

		switch (option_name) {
		case SO_TYPE:
		case SO_ERROR:
		case SO_ACCEPTCONN:
			/* Can't be set */
			error = ENOPROTOOPT;
			goto done2;
		case SO_LINGER: {
			struct linger *l = (struct linger *)optval;

			so->so_linger.l_linger = l->l_linger;
			if (l->l_onoff) {
				so->so_linger.l_onoff = SO_LINGER;
				so->so_options |= SO_LINGER;
			} else {
				so->so_linger.l_onoff = 0;
				so->so_options &= ~SO_LINGER;
			}
			break;
		}

		case SO_DEBUG:
#ifdef SOCK_TEST
			if (intvalue & 2)
				sock_test_timelimit = 10 * hz;
			else
				sock_test_timelimit = 0;

			if (intvalue & 4)
				do_useracc = 0;
			else
				do_useracc = 1;
#endif /* SOCK_TEST */
			/* FALLTHRU */
		case SO_REUSEADDR:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_BROADCAST:
		case SO_USELOOPBACK:
		case SO_OOBINLINE:
		case SO_DGRAM_ERRIND:
			if (intvalue != 0) {
				dprintso(so, 1,
				    ("socket_setsockopt: setting 0x%x\n",
				    option_name));
				so->so_options |= option_name;
			} else {
				dprintso(so, 1,
				    ("socket_setsockopt: clearing 0x%x\n",
				    option_name));
				so->so_options &= ~option_name;
			}
			break;
		/*
		 * The following options are only returned by us when the
		 * transport layer fails.
		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
		 * since the transport might adjust the value and not
		 * return exactly what was set by the application.
		 */
		case SO_SNDBUF:
			so->so_sndbuf = intvalue;
			break;
		case SO_RCVBUF:
			so->so_rcvbuf = intvalue;
			break;
		case SO_RCVPSH:
			so->so_rcv_timer_interval = intvalue;
			break;
#ifdef notyet
		/*
		 * We do not implement the semantics of these options
		 * thus we shouldn't implement the options either.
		 */
		case SO_SNDLOWAT:
			so->so_sndlowat = intvalue;
			break;
		case SO_RCVLOWAT:
			so->so_rcvlowat = intvalue;
			break;
#endif /* notyet */
		case SO_SNDTIMEO:
		case SO_RCVTIMEO: {
			struct timeval *tl = (struct timeval *)optval;
			clock_t val = tl->tv_sec * 1000 * 1000 + tl->tv_usec;

			if (option_name == SO_RCVTIMEO)
				so->so_rcvtimeo = drv_usectohz(val);
			else
				so->so_sndtimeo = drv_usectohz(val);
			break;
		}
		}
#undef	intvalue

		if (error) {
			/*
			 * The transport rejected an option that sockfs
			 * itself recorded above; report success.
			 */
			if ((error == ENOPROTOOPT || error == EPROTO ||
			    error == EINVAL) && handled) {
				dprintso(so, 1,
				    ("setsockopt: ignoring error %d for 0x%x\n",
				    error, option_name));
				error = 0;
			}
		}
	}
done2:
	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
	return (error);
}
/*
 * sotpi_close() is called when the last open reference goes away.
 *
 * Shuts down NL7C if it was enabled, tears down the stream (if there still
 * is one), flushes any saved T_DISCON_IND and drops the driver hold taken
 * for clone opens.  Returns the result of strclose(), or zero when there
 * was no stream to close.
 */
/* ARGSUSED */
int
sotpi_close(struct sonode *so, int flag, struct cred *cr)
{
	sotpi_info_t	*sti = SOTOTPI(so);
	struct vnode	*vp = SOTOV(so);
	dev_t		sockdev = sti->sti_dev;
	int		rval = 0;

	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));

	ASSERT(STREAMSTAB(getmajor(sockdev)));

	mutex_enter(&so->so_lock);
	so_lock_single(so);	/* Set SOLOCKED */

	ASSERT(so_verify_oobstate(so));

	if (sti->sti_nl7c_flags & NL7C_ENABLED) {
		sti->sti_nl7c_flags = 0;
		nl7c_close(so);
	}

	if (vp->v_stream != NULL) {
		vnode_t *bound_vp;

		/* Could avoid this when CANTSENDMORE for !dgram */
		if (so->so_family == AF_UNIX)
			so_unix_close(so);

		/* The stream is torn down without so_lock held */
		mutex_exit(&so->so_lock);

		/*
		 * Disassemble the linkage from the AF_UNIX underlying file
		 * system vnode to this socket (by atomically clearing
		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
		 * and frees the stream head.
		 */
		bound_vp = sti->sti_ux_bound_vp;
		if (bound_vp != NULL) {
			ASSERT(bound_vp->v_stream);
			sti->sti_ux_bound_vp = NULL;
			vn_rele_stream(bound_vp);
		}

		switch (so->so_family) {
		case AF_INET:
		case AF_INET6:
			/* Remove the data hooks and release kssl state */
			strsetrwputdatahooks(vp, NULL, NULL);
			if (sti->sti_kssl_ent != NULL) {
				kssl_release_ent(sti->sti_kssl_ent, so,
				    sti->sti_kssl_type);
				sti->sti_kssl_ent = NULL;
			}
			if (sti->sti_kssl_ctx != NULL) {
				kssl_release_ctx(sti->sti_kssl_ctx);
				sti->sti_kssl_ctx = NULL;
			}
			sti->sti_kssl_type = KSSL_NO_PROXY;
			break;
		default:
			break;
		}

		rval = strclose(vp, flag, cr);
		vp->v_stream = NULL;
		mutex_enter(&so->so_lock);
	}

	/*
	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
	 */
	so_flush_discon_ind(so);

	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);

	/*
	 * Needed for STREAMs.
	 * Decrement the device driver's reference count for streams
	 * opened via the clone dip. The driver was held in clone_open().
	 * The absence of clone_close() forces this asymmetry.
	 */
	if (so->so_flag & SOCLONE)
		ddi_rele_driver(getmajor(sockdev));

	return (rval);
}
/*
 * ioctl entry point for TPI sockets.
 *
 * _I_INSERT/_I_REMOVE are rejected, the module-list ioctls are handed to
 * socktpi_plumbioctl() under sti_plumb_lock, and when the socket has
 * been turned into a plain stream (SOV_STREAM) everything else goes
 * straight to strioctl().  The remaining socket-specific ioctls are
 * processed here; anything not recognized is passed to strioctl() unless it
 * is a STREAMS ioctl on a SOV_SOCKBSD socket.
 */
static int
sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	sotpi_info_t	*sti = SOTOTPI(so);
	struct vnode	*vp = SOTOV(so);
	int		error = 0;

	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
	    cmd, arg, pr_state(so->so_state, so->so_mode)));

	switch (cmd) {
	case _I_INSERT:
	case _I_REMOVE:
		/*
		 * Since there's no compelling reason to support these ioctls
		 * on sockets, and doing so would increase the complexity
		 * markedly, prevent it.
		 */
		return (EOPNOTSUPP);

	case I_FIND:
	case I_LIST:
	case I_LOOK:
	case I_POP:
	case I_PUSH:
		/*
		 * To prevent races and inconsistencies between the actual
		 * state of the stream and the state according to the sonode,
		 * we serialize all operations which modify or operate on the
		 * list of modules on the socket's stream.
		 */
		mutex_enter(&sti->sti_plumb_lock);
		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
		mutex_exit(&sti->sti_plumb_lock);
		return (error);

	default:
		/*
		 * If the imaginary "sockmod" has been popped, act as a
		 * stream.
		 */
		if (so->so_version == SOV_STREAM)
			return (strioctl(vp, cmd, arg, mode, U_TO_K, cr,
			    rvalp));
		break;
	}

	ASSERT(so->so_version != SOV_STREAM);

	/*
	 * Process socket-specific ioctls.
	 */
	switch (cmd) {
	case FIONBIO: {
		int32_t on;

		if (so_copyin((void *)arg, &on, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		if (on == 0)
			so->so_state &= ~SS_NDELAY;
		else
			so->so_state |= SS_NDELAY;
		mutex_exit(&so->so_lock);
		return (0);
	}

	case FIOASYNC: {
		int32_t	on;
		int	want_async, is_async;

		if (so_copyin((void *)arg, &on, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		/* Only flip SS_ASYNC when it doesn't already match. */
		want_async = (on != 0);
		is_async = ((so->so_state & SS_ASYNC) != 0);
		if (want_async != is_async)
			error = so_flip_async(so, vp, mode, cr);
		mutex_exit(&so->so_lock);
		return (error);
	}

	case SIOCSPGRP:
	case FIOSETOWN: {
		pid_t newpgrp;

		if (so_copyin((void *)arg, &newpgrp, sizeof (pid_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		dprintso(so, 1, ("setown: new %d old %d\n",
		    newpgrp, so->so_pgrp));
		/* Any change? */
		if (so->so_pgrp != newpgrp)
			error = so_set_siggrp(so, vp, newpgrp, mode, cr);
		mutex_exit(&so->so_lock);
		return (error);
	}

	case SIOCGPGRP:
	case FIOGETOWN:
		if (so_copyout(&so->so_pgrp, (void *)arg,
		    sizeof (pid_t), (mode & (int)FKIOCTL)))
			return (EFAULT);
		return (0);

	case SIOCATMARK: {
		int	atmark;
		uint_t	snapshot;

		/*
		 * strwaitmark has a finite timeout after which it
		 * returns -1 if the mark state is undetermined.
		 * In order to avoid any race between the mark state
		 * in sockfs and the mark state in the stream head this
		 * routine loops until the mark state can be determined
		 * (or the urgent data indication has been removed by some
		 * other thread).
		 */
		for (;;) {
			mutex_enter(&so->so_lock);
			snapshot = so->so_state;
			mutex_exit(&so->so_lock);

			if (snapshot & SS_RCVATMARK) {
				atmark = 1;
				break;
			}
			if (!(snapshot & SS_OOBPEND)) {
				/*
				 * No SIGURG has been generated -- there is no
				 * pending or present urgent data. Thus can't
				 * possibly be at the mark.
				 */
				atmark = 0;
				break;
			}

			/*
			 * Have the stream head wait until there is
			 * either some messages on the read queue, or
			 * STRATMARK or STRNOTATMARK gets set. The
			 * STRNOTATMARK flag is used so that the
			 * transport can send up a MSGNOTMARKNEXT
			 * M_DATA to indicate that it is not
			 * at the mark and additional data is not about
			 * to be send upstream.
			 *
			 * If the mark state is undetermined this will
			 * return -1 and we will loop rechecking the
			 * socket state.
			 */
			atmark = strwaitmark(vp);
			if (atmark != -1)
				break;
		}

		if (so_copyout(&atmark, (void *)arg, sizeof (int),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);
		return (0);
	}

	case I_FDINSERT:
	case I_SENDFD:
	case I_RECVFD:
	case I_ATMARK:
	case _SIOCSOCKFALLBACK:
		/*
		 * These ioctls do not apply to sockets. I_FDINSERT can be
		 * used to send M_PROTO messages without modifying the socket
		 * state. I_SENDFD/RECVFD should not be used for socket file
		 * descriptor passing since they assume a twisted stream.
		 * SIOCATMARK must be used instead of I_ATMARK.
		 *
		 * _SIOCSOCKFALLBACK from an application should never be
		 * processed.  It is only generated by socktpi_open() or
		 * in response to I_POP or I_PUSH.
		 */
#ifdef DEBUG
		zcmn_err(getzoneid(), CE_WARN,
		    "Unsupported STREAMS ioctl 0x%x on socket. "
		    "Pid = %d\n", cmd, curproc->p_pid);
#endif /* DEBUG */
		return (EOPNOTSUPP);

	case _I_GETPEERCRED:
		/* Only honoured when FKIOCTL is set in mode */
		if ((mode & FKIOCTL) == 0)
			return (EINVAL);

		mutex_enter(&so->so_lock);
		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
			error = ENOTSUP;
		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
			error = ENOTCONN;
		} else if (so->so_peercred != NULL) {
			k_peercred_t *kp = (k_peercred_t *)arg;

			/* A hold is placed on the cred that is handed out */
			kp->pc_cr = so->so_peercred;
			kp->pc_cpid = so->so_cpid;
			crhold(so->so_peercred);
		} else {
			error = EINVAL;
		}
		mutex_exit(&so->so_lock);
		return (error);

	default:
		/*
		 * Do the higher-order bits of the ioctl cmd indicate
		 * that it is an I_* streams ioctl?
		 */
		if ((cmd & 0xffffff00U) == STR &&
		    so->so_version == SOV_SOCKBSD) {
#ifdef DEBUG
			zcmn_err(getzoneid(), CE_WARN,
			    "Unsupported STREAMS ioctl 0x%x on socket. "
			    "Pid = %d\n", cmd, curproc->p_pid);
#endif /* DEBUG */
			return (EOPNOTSUPP);
		}
		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
	}
}
/*
 * Handle plumbing-related ioctls (I_PUSH, I_POP, I_LIST, I_LOOK, I_FIND).
 *
 * The caller must hold sti_plumb_lock.  While the sonode is a socket, an
 * imaginary "sockmod" module is emulated at position sti_pushcnt in the
 * module list; popping it turns the socket into a plain stream and pushing
 * "sockmod" on such a stream turns it back into a socket.
 */
static int
socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	static const char sockmod_name[] = "sockmod";
	struct sonode	*so = VTOSO(vp);
	sotpi_info_t	*sti = SOTOTPI(so);
	char		mname[FMNAMESZ + 1];
	int		error;

	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));

	if (so->so_version == SOV_SOCKBSD)
		return (EOPNOTSUPP);

	if (so->so_version == SOV_STREAM) {
		/*
		 * The imaginary "sockmod" has been popped - act as a stream.
		 * If this is a push of sockmod then change back to a socket.
		 */
		if (cmd == I_PUSH) {
			/* The name comes from the kernel for FKIOCTL callers */
			if (mode & FKIOCTL) {
				error = copystr((void *)arg, mname,
				    sizeof (mname), NULL);
			} else {
				error = copyinstr((void *)arg, mname,
				    sizeof (mname), NULL);
			}
			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
				dprintso(so, 0, ("socktpi_ioctl: going to "
				    "socket version\n"));
				so_stream2sock(so);
				return (0);
			}
		}
		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
	}

	switch (cmd) {
	case I_PUSH:
		if (sti->sti_direct) {
			/*
			 * Issue _SIOCSOCKFALLBACK with SOLOCKED held and
			 * clear sti_direct only if it succeeds.
			 */
			mutex_enter(&so->so_lock);
			so_lock_single(so);
			mutex_exit(&so->so_lock);

			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
			    CRED(), rvalp);

			mutex_enter(&so->so_lock);
			if (error == 0)
				sti->sti_direct = 0;
			so_unlock_single(so, SOLOCKED);
			mutex_exit(&so->so_lock);

			if (error != 0)
				return (error);
		}

		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
		if (error == 0)
			sti->sti_pushcnt++;
		return (error);

	case I_POP:
		if (sti->sti_pushcnt == 0) {
			/* Emulate sockmod being popped */
			dprintso(so, 0,
			    ("socktpi_ioctl: going to STREAMS version\n"));
			return (so_sock2stream(so));
		}

		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
		if (error == 0)
			sti->sti_pushcnt--;
		return (error);

	case I_LIST: {
		struct str_mlist *kmods, *umods;
		struct str_list	 klist;
		ssize_t		 klistsize;
		int		 idx, nmods;

		STRUCT_DECL(str_list, ustrlist);
		STRUCT_INIT(ustrlist, mode);

		if (arg == 0) {
			/* Caller only wants the count of modules */
			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
			if (error == 0)
				(*rvalp)++;	/* Add one for sockmod */
			return (error);
		}

		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
		if (error != 0)
			return (error);

		nmods = STRUCT_FGET(ustrlist, sl_nmods);
		if (nmods <= 0)
			return (EINVAL);
		/*
		 * Ceiling nmods at nstrpush to prevent someone from
		 * maliciously consuming lots of kernel memory.
		 */
		nmods = MIN(nmods, nstrpush);

		/* One extra slot is allocated for the emulated sockmod */
		klistsize = (nmods + 1) * sizeof (struct str_mlist);
		klist.sl_nmods = nmods;
		klist.sl_modlist = kmem_zalloc(klistsize, KM_SLEEP);

		error = strioctl(vp, cmd, (intptr_t)&klist, mode, K_TO_K,
		    cr, rvalp);
		if (error == 0) {
			/*
			 * Considering the module list as a 0-based array of
			 * sl_nmods modules, sockmod should conceptually exist
			 * at slot sti_pushcnt.  Insert sockmod at this
			 * location by sliding all of the module names after
			 * sti_pushcnt over by one.  We know that there will
			 * be room to do this since we allocated sl_modlist
			 * with an additional slot.
			 */
			idx = klist.sl_nmods;
			while (idx > sti->sti_pushcnt) {
				klist.sl_modlist[idx] =
				    klist.sl_modlist[idx - 1];
				idx--;
			}
			(void) strcpy(klist.sl_modlist[idx].l_name,
			    sockmod_name);
			klist.sl_nmods++;

			/*
			 * Copy all of the entries out to ustrlist.
			 */
			kmods = klist.sl_modlist;
			umods = STRUCT_FGETP(ustrlist, sl_modlist);
			for (idx = 0; idx < nmods && idx < klist.sl_nmods;
			    idx++) {
				error = so_copyout(&kmods[idx], &umods[idx],
				    sizeof (struct str_mlist), mode & FKIOCTL);
				if (error != 0)
					break;
			}

			if (error == 0) {
				/* Report how many entries were copied out */
				error = so_copyout(&idx, (void *)arg,
				    sizeof (int32_t), mode & FKIOCTL);
				if (error == 0)
					*rvalp = 0;
			}
		}

		kmem_free(klist.sl_modlist, klistsize);
		return (error);
	}

	case I_LOOK:
		/* With nothing pushed, the emulated sockmod is what is seen */
		if (sti->sti_pushcnt == 0) {
			return (so_copyout(sockmod_name, (void *)arg,
			    sizeof (sockmod_name), mode & FKIOCTL));
		}
		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));

	case I_FIND:
		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
		if (error && error != EINVAL)
			return (error);

		/* if not found and string was sockmod return 1 */
		if (*rvalp == 0 || error == EINVAL) {
			if (mode & FKIOCTL) {
				error = copystr((void *)arg, mname,
				    sizeof (mname), NULL);
			} else {
				error = copyinstr((void *)arg, mname,
				    sizeof (mname), NULL);
			}
			if (error == ENAMETOOLONG)
				error = EINVAL;

			if (error == 0 && strcmp(mname, sockmod_name) == 0)
				*rvalp = 1;
		}
		return (error);

	default:
		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
		break;
	}

	return (0);
}
/*
* Wrapper around the streams poll routine that implements socket poll
* semantics.
* The sockfs never calls pollwakeup itself - the stream head takes care
* of all pollwakeups. Since sockfs never holds so_lock when calling the
* stream head there can never be a deadlock due to holding so_lock across
* pollwakeup and acquiring so_lock in this routine.
*
* However, since the performance of VOP_POLL is critical we avoid
* acquiring so_lock here. This is based on two assumptions:
* - The poll implementation holds locks to serialize the VOP_POLL call
* and a pollwakeup for the same pollhead. This ensures that should
* e.g. so_state change during a socktpi_poll call the pollwakeup
* (which strsock_* and strrput conspire to issue) is issued after
* the state change. Thus the pollwakeup will block until VOP_POLL has
* returned and then wake up poll and have it call VOP_POLL again.
* - The reading of so_state without holding so_lock does not result in
* stale data that is older than the latest state change that has dropped
* so_lock. This is ensured by the mutex_exit issuing the appropriate
* memory barrier to force the data into the coherency domain.
*/
static int
sotpi_poll(
struct sonode *so,
short events,
int anyyet,
short *reventsp,
struct pollhead **phpp)
{
short origevents = events;
struct vnode *vp = SOTOV(so);
int error;
int so_state = so->so_state; /* snapshot */
sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
(void *)vp, pr_state(so_state, so->so_mode), so->so_error));
ASSERT(vp->v_type == VSOCK);
ASSERT(vp->v_stream != NULL);
if (so->so_version == SOV_STREAM) {
/* The imaginary "sockmod" has been popped - act as a stream */
return (strpoll(vp->v_stream, events, anyyet,
reventsp, phpp));
}
if (!(so_state & SS_ISCONNECTED) &&
(so->so_mode & SM_CONNREQUIRED)) {
/* Not connected yet - turn off write side events */
events &= ~(POLLOUT|POLLWRBAND);
}
/*
* Check for errors without calling strpoll if the caller wants them.
* In sockets the errors are represented as input/output events
* and there is no need to ask the stream head for this information.
*/
if (so->so_error != 0 &&
((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
return (0);
}
/*
* Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
* These message with only an M_PROTO/M_PCPROTO part and no M_DATA
* will not trigger a POLLIN event with POLLRDDATA set.
* The handling of urgent data (causing POLLRDBAND) is done by
* inspecting SS_OOBPEND below.
*/
events |= POLLRDDATA;
/*
* After shutdown(output) a stream head write error is set.
* However, we should not return output events.
*/
events |= POLLNOERR;
error = strpoll(vp->v_stream, events, anyyet,
reventsp, phpp);
if (error)
return (error);
ASSERT(!(*reventsp & POLLERR));
/*
* Notes on T_CONN_IND handling for sockets.
*
* If strpoll() returned without events, SR_POLLIN is guaranteed
* to be set, ensuring any subsequent strrput() runs pollwakeup().
*
* Since the so_lock is not held, soqueueconnind() may have run
* and a T_CONN_IND may be waiting. We now check for any queued
* T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
* to ensure poll returns.
*
* However:
* If the T_CONN_IND hasn't arrived by the time strpoll() returns,
* when strrput() does run for an arriving M_PROTO with T_CONN_IND
* the following actions will occur; taken together they ensure the
* syscall will return.
*
* 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
* the accept() was run on a non-blocking socket sowaitconnind()
* may have already returned EWOULDBLOCK, so not be waiting to
* process the message. Additionally socktpi_poll() has probably
* proceeded past the sti_conn_ind_head check below.
* 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
* this thread, however that could occur before poll_common()
* has entered cv_wait.
* 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
*
* Before proceeding to cv_wait() in poll_common() for an event,
* poll_common() atomically checks for T_POLLWAKE under the pc_lock,
* and if set, re-calls strpoll() to ensure the late arriving
* T_CONN_IND is recognized, and pollsys() returns.
*/
if (sti->sti_conn_ind_head != NULL)
*reventsp |= (POLLIN|POLLRDNORM) & events;
if (so->so_state & SS_OOBPEND)
*reventsp |= POLLRDBAND & events;
if (sti->sti_nl7c_rcv_mp != NULL) {
*reventsp |= (POLLIN|POLLRDNORM) & events;
}
if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
((POLLIN|POLLRDNORM) & *reventsp)) {
sti->sti_nl7c_flags |= NL7C_POLLIN;
}
return (0);
}
/*
 * kmem cache constructor for TPI sonodes.
 *
 * Constructs the generic sonode part of the buffer first, then the
 * embedded sotpi_info_t, and finally points so_priv at the embedded TPI
 * info.
 *
 * Returns 0 on success. If either constructor fails its error is returned;
 * when the TPI info constructor fails, the already constructed sonode part
 * is destructed again and the buffer is not written to afterwards.
 */
/*ARGSUSED*/
static int
socktpi_constructor(void *buf, void *cdrarg, int kmflags)
{
	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
	int error;

	error = sonode_constructor(buf, cdrarg, kmflags);
	if (error != 0)
		return (error);

	error = i_sotpi_info_constructor(&st->st_info);
	if (error != 0) {
		/* Undo the sonode construction and leave the buffer alone. */
		sonode_destructor(buf, cdrarg);
		return (error);
	}

	st->st_sonode.so_priv = &st->st_info;
	return (0);
}
/*
 * kmem cache destructor for TPI sonodes.
 *
 * Detaches the embedded TPI info from the sonode (so_priv is expected to
 * point at it), destroys the TPI info and then the generic sonode part.
 */
/*ARGSUSED1*/
static void
socktpi_destructor(void *buf, void *cdrarg)
{
	sotpi_sonode_t *tpi_node = buf;

	ASSERT(tpi_node->st_sonode.so_priv == &tpi_node->st_info);
	tpi_node->st_sonode.so_priv = NULL;

	i_sotpi_info_destructor(&tpi_node->st_info);
	sonode_destructor(buf, cdrarg);
}
/*
 * kmem cache constructor for AF_UNIX TPI sonodes.
 *
 * Runs the regular TPI constructor and, if that succeeds, inserts the
 * sonode at the head of the global socklist while holding sl_lock.
 * Returns the result of socktpi_constructor().
 */
static int
socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct sonode *so = (struct sonode *)buf;
	sotpi_info_t *sti;
	int retval;

	retval = socktpi_constructor(buf, cdrarg, kmflags);
	if (retval != 0)
		return (retval);

	sti = SOTOTPI(so);

	/* Link the new node in front of the current list head. */
	mutex_enter(&socklist.sl_lock);
	sti->sti_prev_so = NULL;
	sti->sti_next_so = socklist.sl_list;
	if (sti->sti_next_so != NULL)
		SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
	socklist.sl_list = so;
	mutex_exit(&socklist.sl_lock);

	return (retval);
}
/*
 * kmem cache destructor for AF_UNIX TPI sonodes.
 *
 * Unlinks the sonode from the global socklist while holding sl_lock and
 * then runs the regular TPI destructor.
 */
static void
socktpi_unix_destructor(void *buf, void *cdrarg)
{
	struct sonode *so = (struct sonode *)buf;
	sotpi_info_t *sti = SOTOTPI(so);

	mutex_enter(&socklist.sl_lock);

	/* Fix up the back pointer of the successor, if there is one. */
	if (sti->sti_next_so != NULL)
		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;

	/* No predecessor means this node was the list head. */
	if (sti->sti_prev_so == NULL)
		socklist.sl_list = sti->sti_next_so;
	else
		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;

	mutex_exit(&socklist.sl_lock);

	socktpi_destructor(buf, cdrarg);
}
int
socktpi_init(void)
{
/*
* Create sonode caches. We create a special one for AF_UNIX so
* that we can track them for netstat(1m).
*/
socktpi_cache = kmem_cache_create("socktpi_cache",
sizeof (struct sotpi_sonode), 0, socktpi_constructor,
socktpi_destructor, NULL, NULL, NULL, 0);
socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
socktpi_unix_destructor, NULL, NULL, NULL, 0);
return (0);
}
/*
 * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
 *
 * The sonode is switched over to newsp, its TPI private data is created
 * and initialized, and sotpi_init() is run with SO_FALLBACK.
 *
 * Caller must still update state and mode using sotpi_update_state().
 *
 * Arguments:
 *	so	- the sonode to convert
 *	newsp	- sockparams to install on the sonode
 *	direct	- out: B_TRUE if sti_direct is set after initialization,
 *		  B_FALSE otherwise
 *	cr	- credentials passed to sotpi_init() and so_set_events()
 *
 * Returns the STREAM queue that the protocol should use, or NULL if
 * sotpi_init() failed. On failure the TPI private data is torn down and
 * both so_sockparams and so_state are put back to the values they had on
 * entry, so the sonode does not keep referring to newsp.
 */
queue_t *
sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
    boolean_t *direct, struct cred *cr)
{
	sotpi_info_t *sti;
	struct sockparams *origsp = so->so_sockparams;
	sock_lower_handle_t handle = so->so_proto_handle;
	uint_t old_state = so->so_state;
	struct stdata *stp;
	struct vnode *vp;
	queue_t *q;

	*direct = B_FALSE;
	so->so_sockparams = newsp;

	/*
	 * Allocate and initialize fields required by TPI.
	 */
	(void) sotpi_info_create(so, KM_SLEEP);
	sotpi_info_init(so);

	if (sotpi_init(so, NULL, cr, SO_FALLBACK) != 0) {
		sotpi_info_fini(so);
		sotpi_info_destroy(so);
		/* Roll back everything this function changed on the sonode. */
		so->so_sockparams = origsp;
		so->so_state = old_state;
		return (NULL);
	}
	ASSERT(handle == so->so_proto_handle);
	sti = SOTOTPI(so);
	if (sti->sti_direct != 0)
		*direct = B_TRUE;

	/*
	 * Keep the original sp around so we can properly dispose of the
	 * sonode when the socket is being closed.
	 */
	sti->sti_orig_sp = origsp;

	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
	so_alloc_addr(so, so->so_max_addr_len);

	/*
	 * If the application has done a SIOCSPGRP, make sure the
	 * STREAM head is aware. This needs to take place before
	 * the protocol starts sending up messages. Otherwise we
	 * might fail to generate SIGPOLL.
	 *
	 * It is possible that the application will receive duplicate
	 * signals if some were already generated for either data or
	 * connection indications.
	 */
	if (so->so_pgrp != 0) {
		mutex_enter(&so->so_lock);
		if (so_set_events(so, so->so_vnode, cr) != 0)
			so->so_pgrp = 0;
		mutex_exit(&so->so_lock);
	}

	/*
	 * Determine which queue to use.
	 */
	vp = SOTOV(so);
	stp = vp->v_stream;
	ASSERT(stp != NULL);
	q = stp->sd_wrq->q_next;

	/*
	 * Skip any modules that may have been auto pushed when the device
	 * was opened
	 */
	while (q->q_next != NULL)
		q = q->q_next;
	q = _RD(q);

	return (q);
}
/*
 * Update the TPI state of a sonode.
 *
 * Processes the T_capability_ack, ORs opts into so_options, and caches the
 * supplied local and foreign addresses.
 *
 * Arguments:
 *	so	 - the sonode to update
 *	tcap	 - capability ack handed to so_proc_tcapability_ack()
 *	laddr	 - local address; only used if laddrlen != 0
 *	laddrlen - length of laddr; must not exceed sti_laddr_maxlen
 *	faddr	 - foreign address; only used if faddrlen != 0
 *	faddrlen - length of faddr; must not exceed sti_faddr_maxlen
 *	opts	 - socket options to add to so_options
 *
 * A cached local address is marked valid only if the socket is bound, and
 * a cached foreign address only if the socket is connected. The validity
 * flags are stored as 0 or 1 rather than as the raw masked state bits, so
 * the result does not depend on the width of the flag fields or on the bit
 * positions of SS_ISBOUND and SS_ISCONNECTED.
 */
void
sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
    struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
    socklen_t faddrlen, short opts)
{
	sotpi_info_t *sti = SOTOTPI(so);

	so_proc_tcapability_ack(so, tcap);

	so->so_options |= opts;

	/*
	 * Determine whether the foreign and local address are valid
	 */
	if (laddrlen != 0) {
		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
		sti->sti_laddr_len = laddrlen;
		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
		sti->sti_laddr_valid = ((so->so_state & SS_ISBOUND) != 0);
	}

	if (faddrlen != 0) {
		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
		sti->sti_faddr_len = faddrlen;
		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
		sti->sti_faddr_valid = ((so->so_state & SS_ISCONNECTED) != 0);
	}
}
/*
 * Allocate enough space to cache the local and foreign addresses.
 *
 * maxlen is rounded up to a multiple of KMEM_ALIGN and recorded as the
 * maximum length of both addresses as well as in so_max_addr_len. Both
 * addresses share a single allocation of twice that size: the local
 * address occupies the first half and the foreign address the second.
 * For AF_UNIX sockets the sti_ux_laddr and sti_ux_faddr fields are also
 * zeroed.
 *
 * No address buffers may be attached to the sonode on entry.
 */
void
so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
{
	sotpi_info_t *sti = SOTOTPI(so);
	void *addrbuf;

	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);

	sti->sti_faddr_maxlen = P2ROUNDUP(maxlen, KMEM_ALIGN);
	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen;
	so->so_max_addr_len = sti->sti_laddr_maxlen;

	/* One buffer, split evenly between the two addresses. */
	addrbuf = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
	sti->sti_laddr_sa = addrbuf;
	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
	    + sti->sti_laddr_maxlen);

	if (so->so_family != AF_UNIX)
		return;

	/*
	 * Initialize AF_UNIX related fields.
	 */
	bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
	bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
}
/*
 * Return the TPI private data hanging off so_priv of the given sonode,
 * or NULL if so itself is NULL. For a non-NULL sonode the private data
 * is asserted to be present and to carry SOTPI_INFO_MAGIC.
 */
sotpi_info_t *
sotpi_sototpi(struct sonode *so)
{
	sotpi_info_t *sti = NULL;

	if (so != NULL) {
		sti = (sotpi_info_t *)so->so_priv;
		ASSERT(sti != NULL);
		ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
	}

	return (sti);
}
/*
 * Put a sotpi_info_t into its constructed state: stamp the magic number,
 * clear the message, vnode, address and NL7C pointers, and initialize
 * sti_plumb_lock and sti_ack_cv. Always returns 0.
 */
static int
i_sotpi_info_constructor(sotpi_info_t *sti)
{
	sti->sti_magic = SOTPI_INFO_MAGIC;

	/* Held TPI messages and the connection indication queue */
	sti->sti_ack_mp = NULL;
	sti->sti_discon_ind_mp = NULL;
	sti->sti_unbind_mp = NULL;
	sti->sti_conn_ind_head = NULL;
	sti->sti_conn_ind_tail = NULL;

	/* AF_UNIX bound vnode and cached addresses */
	sti->sti_ux_bound_vp = NULL;
	sti->sti_laddr_sa = NULL;
	sti->sti_faddr_sa = NULL;

	/* NL7C state */
	sti->sti_nl7c_flags = 0;
	sti->sti_nl7c_uri = NULL;
	sti->sti_nl7c_rcv_mp = NULL;

	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);

	return (0);
}
/*
 * Tear down a constructed sotpi_info_t. The structure is asserted to be
 * back in its constructed state (magic intact, every pointer and the NL7C
 * flags cleared) before sti_plumb_lock and sti_ack_cv are destroyed.
 */
static void
i_sotpi_info_destructor(sotpi_info_t *sti)
{
	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);

	/* Nothing may still be attached to the structure. */
	ASSERT(sti->sti_ack_mp == NULL);
	ASSERT(sti->sti_discon_ind_mp == NULL);
	ASSERT(sti->sti_ux_bound_vp == NULL);
	ASSERT(sti->sti_unbind_mp == NULL);
	ASSERT(sti->sti_conn_ind_head == NULL);
	ASSERT(sti->sti_conn_ind_tail == NULL);
	ASSERT(sti->sti_laddr_sa == NULL);
	ASSERT(sti->sti_faddr_sa == NULL);
	ASSERT(sti->sti_nl7c_flags == 0);
	ASSERT(sti->sti_nl7c_uri == NULL);
	ASSERT(sti->sti_nl7c_rcv_mp == NULL);

	mutex_destroy(&sti->sti_plumb_lock);
	cv_destroy(&sti->sti_ack_cv);
}
/*
 * Creates and attaches TPI information to the given sonode.
 *
 * A zeroed sotpi_info_t is allocated with the given kmflags, constructed,
 * and stored in so_priv, which must be NULL on entry. Returns B_TRUE on
 * success, or B_FALSE if the allocation or the construction failed, in
 * which case so_priv is left untouched.
 */
static boolean_t
sotpi_info_create(struct sonode *so, int kmflags)
{
	sotpi_info_t *sti;

	ASSERT(so->so_priv == NULL);

	sti = kmem_zalloc(sizeof (*sti), kmflags);
	if (sti == NULL)
		return (B_FALSE);

	if (i_sotpi_info_constructor(sti) == 0) {
		so->so_priv = (void *)sti;
		return (B_TRUE);
	}

	kmem_free(sti, sizeof (*sti));
	return (B_FALSE);
}
/*
 * Initializes the TPI information.
 *
 * The device number is taken from the vnode of the sonode's sockparams
 * and is also stored in the socket vnode. The access, modification and
 * change times are all set to the current time. Every other field handled
 * here is reset to zero/NULL, and the kernel SSL proxy type is set to
 * KSSL_NO_PROXY. The fields cleared by the constructor are asserted to
 * still be clear.
 */
static void
sotpi_info_init(struct sonode *so)
{
	struct vnode *vp = SOTOV(so);
	sotpi_info_t *sti = SOTOTPI(so);
	time_t tstamp;

	sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
	vp->v_rdev = sti->sti_dev;

	sti->sti_orig_sp = NULL;
	sti->sti_pushcnt = 0;

	/* All three timestamps start out identical. */
	tstamp = gethrestime_sec();
	sti->sti_atime = tstamp;
	sti->sti_mtime = tstamp;
	sti->sti_ctime = tstamp;

	sti->sti_eaddr_mp = NULL;
	sti->sti_delayed_error = 0;

	sti->sti_provinfo = NULL;

	sti->sti_oobcnt = 0;
	sti->sti_oobsigcnt = 0;

	/* No address buffers yet; so_alloc_addr() attaches them later. */
	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
	sti->sti_laddr_sa = NULL;
	sti->sti_faddr_sa = NULL;
	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
	sti->sti_laddr_len = sti->sti_faddr_len = 0;

	sti->sti_laddr_valid = 0;
	sti->sti_faddr_valid = 0;
	sti->sti_faddr_noxlate = 0;

	sti->sti_direct = 0;

	ASSERT(sti->sti_ack_mp == NULL);
	ASSERT(sti->sti_ux_bound_vp == NULL);
	ASSERT(sti->sti_unbind_mp == NULL);
	ASSERT(sti->sti_conn_ind_head == NULL);
	ASSERT(sti->sti_conn_ind_tail == NULL);

	/* Initialize the kernel SSL proxy fields */
	sti->sti_kssl_type = KSSL_NO_PROXY;
	sti->sti_kssl_ent = NULL;
	sti->sti_kssl_ctx = NULL;
}
/*
 * Given a sonode, grab the TPI info and free any data.
 *
 * Frees the queued connection indications, the cached address buffer,
 * and the held error-address, ack, NL7C receive and unbind messages, and
 * resets the NL7C state. The sotpi_info_t itself is not freed here.
 */
static void
sotpi_info_fini(struct sonode *so)
{
	sotpi_info_t *sti = SOTOTPI(so);
	mblk_t *mp;

	ASSERT(sti->sti_discon_ind_mp == NULL);

	/* Drain the list of queued connection indications. */
	if (sti->sti_conn_ind_head != NULL) {
		mblk_t *next;

		for (mp = sti->sti_conn_ind_head; mp != NULL; mp = next) {
			next = mp->b_next;
			mp->b_next = NULL;
			freemsg(mp);
		}
		sti->sti_conn_ind_head = NULL;
		sti->sti_conn_ind_tail = NULL;
	}

	/*
	 * Protect sti_laddr_sa and sti_faddr_sa so that sockfs_snapshot()
	 * can safely indirect them. It also uses so_count as a validity
	 * test.
	 */
	mutex_enter(&so->so_lock);
	if (sti->sti_laddr_sa != NULL) {
		/* Both addresses live in one buffer; see so_alloc_addr(). */
		ASSERT((caddr_t)sti->sti_faddr_sa ==
		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);

		sti->sti_laddr_valid = 0;
		sti->sti_faddr_valid = 0;
		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);

		sti->sti_laddr_sa = NULL;
		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
		sti->sti_faddr_sa = NULL;
		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
	}
	mutex_exit(&so->so_lock);

	/* The delayed error is only reset along with its address message. */
	if (sti->sti_eaddr_mp != NULL) {
		freemsg(sti->sti_eaddr_mp);
		sti->sti_eaddr_mp = NULL;
		sti->sti_delayed_error = 0;
	}

	if (sti->sti_ack_mp != NULL) {
		freemsg(sti->sti_ack_mp);
		sti->sti_ack_mp = NULL;
	}

	/* Detach the NL7C receive message before freeing it. */
	mp = sti->sti_nl7c_rcv_mp;
	if (mp != NULL) {
		sti->sti_nl7c_rcv_mp = NULL;
		freemsg(mp);
	}
	sti->sti_nl7c_rcv_rval = 0;

	if (sti->sti_nl7c_uri != NULL) {
		nl7c_urifree(so);
		/* urifree() cleared nl7c_uri */
	}
	if (sti->sti_nl7c_flags != 0) {
		sti->sti_nl7c_flags = 0;
	}

	ASSERT(sti->sti_ux_bound_vp == NULL);

	if (sti->sti_unbind_mp != NULL) {
		freemsg(sti->sti_unbind_mp);
		sti->sti_unbind_mp = NULL;
	}
}
/*
* Destroys the TPI information attached to a sonode.
*/
static void
sotpi_info_destroy(struct sonode *so)
{
sotpi_info_t *sti = SOTOTPI(so);
i_sotpi_info_destructor(sti);
kmem_free(sti, sizeof (*sti));
so->so_priv = NULL;
}
/*
* Create the global sotpi socket module entry. It will never be freed.
*/
smod_info_t *
sotpi_smod_create(void)
{
smod_info_t *smodp;
smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
/*
* Initialize the smod_refcnt to 1 so it will never be freed.
*/
smodp->smod_refcnt = 1;
smodp->smod_uc_version = SOCK_UC_VERSION;
smodp->smod_dc_version = SOCK_DC_VERSION;
smodp->smod_sock_create_func = &sotpi_create;
smodp->smod_sock_destroy_func = &sotpi_destroy;
return (smodp);
}