sockcommon_subr.c revision 2c632ad51ffa982737943af0e3c0abfdbe65f571
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/socketvar.h>
#ifdef SOCK_TEST
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */
#define MBLK_PULL_LEN 64
#ifdef DEBUG
#endif
int
{
so->so_acceptq_len++;
/* Wakeup a single consumer */
return (so->so_acceptq_len);
}
/*
* int so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
*
* Enqueue an incoming connection on a listening socket.
*
* Arguments:
* so - listening socket
* nso - new connection
*
* Returns:
* Number of queued connections, including the new connection
*/
int
{
int conns;
return (conns);
}
static int
{
/*
* No need to check so_error here, because it is not
* possible for a listening socket to be reset or otherwise
* disconnected.
*
* So now we just need to check if it's ok to wait.
*/
if (dontblock)
return (EWOULDBLOCK);
return (EINTR);
&so->so_acceptq_lock) == 0)
return (EINTR);
}
}
--so->so_acceptq_len;
return (0);
}
/*
* int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
*
* Pulls a connection off of the accept queue.
*
* Arguments:
* so - listening socket
* dontblock - indicate whether it's ok to sleep if there are no
* connections on the queue
* nsop - Value-return argument
*
* Return values:
* 0 when a connection is successfully dequeued, in which case nsop
* is set to point to the new connection. Upon failure a non-zero
* value is returned, and the value of nsop is set to NULL.
*
* Note:
* so_acceptq_dequeue() may return prematurely if the socket is falling
* back to TPI.
*/
int
{
int error;
return (error);
}
/*
* void so_acceptq_flush(struct sonode *so)
*
* Removes all pending connections from a listening socket, and
* frees the associated resources.
*
* Arguments
* so - listening socket
*
* Return values:
* None.
*
* Note:
* The caller has to ensure that no calls to so_acceptq_enqueue() or
* so_acceptq_dequeue() occur while the accept queue is being flushed.
* So either the socket needs to be in a state where no operations
* would come in, or so_lock needs to be obtained.
*/
void
{
/*
* Since the socket is on the accept queue, there can
* only be one reference. We drop the reference and
* just blow off the socket.
*/
}
so->so_acceptq_len = 0;
}
int
{
/*
* The protocol has notified us that a connection attempt is being
* made, so before we wait for a notification to arrive we must
* clear out any errors associated with earlier connection attempts.
*/
if (nonblock)
return (EINPROGRESS);
return (EINTR);
return (EINTR);
}
/*
* Under normal circumstances, so_error should contain an error
* in case the connect failed. However, it is possible for another
* thread to come in and consume the error, so generate a sensible
* error in that case.
*/
return (ECONNREFUSED);
return (0);
}
/*
* int so_wait_connected(struct sonode *so, boolean_t nonblock,
* sock_connid_t id)
*
* Wait until the socket is connected or an error has occurred.
*
* Arguments:
* so - socket
* nonblock - indicate whether it's ok to sleep if the connection has
* not yet been established
* id - generation number that was returned by the protocol
* when the operation was started
*
* Returns:
* 0 if the connection attempt was successful, or an error indicating why
* the connection attempt failed.
*/
int
{
int error;
return (error);
}
int
{
int error;
while (so->so_snd_qfull) {
return (EPIPE);
if (dontblock)
return (EWOULDBLOCK);
return (EINTR);
if (so->so_sndtimeo == 0) {
/*
* Zero means disable timeout.
*/
} else {
now);
}
if (error == 0)
return (EINTR);
else if (error == -1)
return (EAGAIN);
}
return (0);
}
/*
* int so_wait_sendbuf(struct sonode *so, boolean_t dontblock)
*
* Wait for the transport to notify us about send buffers becoming
* available.
*/
int
{
int error = 0;
if (so->so_snd_qfull) {
}
return (error);
}
void
{
}
void
{
/* wake up everyone waiting for buffers */
}
/*
*/
int
{
int error;
if (pid != 0) {
/*
* Permissions check by sending signal 0.
* Note that when kill fails it does a
* set_errno causing the system call to fail.
*/
if (error != 0) {
return (error);
}
}
return (0);
}
/*
* Generate a SIGIO, for 'writable' events include siginfo structure,
* for read events just send the signal.
*/
/*ARGSUSED*/
static void
{
if (event & SOCKETSIG_WRITE) {
}
if (event & SOCKETSIG_READ) {
}
if (event & SOCKETSIG_URG) {
}
}
void
{
event != SOCKETSIG_URG)) {
return;
}
/*
* XXX This unfortunately still generates
* a signal when a fd is closed but
* the proc is active.
*/
return;
}
} else {
/*
* Send to process group. Hold pidlock across
* calls to socket_sigproc().
*/
}
}
}
/*
* Evaluates to the smaller of a and b (b when the two compare equal).
* One of the arguments is evaluated twice, so do not pass expressions
* with side effects.
*/
#define MIN(a, b) ((a) < (b) ? (a) : (b))
/* Copy userdata into a new mblk_t */
mblk_t *
{
/* Nothing to do in these cases, so we're done */
goto done;
/*
* We will enter the loop below if iosize is 0; it will allocate an
* empty message block and call uiomove(9F) which will just return.
* We could avoid that with an extra check but would only slow
* down the much more likely case where iosize is larger than 0.
*/
do {
if (is_system_labeled())
else
return (head);
}
/* uiomove(9F) either returns 0 or EFAULT */
return (NULL);
}
} while (iosize > 0);
done:
*errorp = 0;
return (head);
}
mblk_t *
{
int error;
ptrdiff_t n;
/*
* max_read is the offset of the oobmark and a read cannot go past
* the oobmark.
*/
do {
ASSERT(n > 0);
if (error != 0) {
return (NULL);
}
}
max_read -= n;
/*
* get rid of zero length mblks
*/
}
*errorp = 0;
return (mp);
}
static void
{
#ifdef DEBUG
} else {
#endif
}
#ifdef DEBUG
if (so_debug_length) {
}
#endif
}
/*
* Move a mblk chain (mp_head, mp_last_head) to the sonode's rcv queue so it
* can be processed by so_dequeue_msg().
*/
void
{
} else {
/*
* Append to last_head if more than one mblks, and both
*/
} else {
#ifdef DEBUG
{
}
}
#endif
}
}
}
/*
* Check flow control on a given sonode. Must have so_lock held, and
* this function will release the hold.
*/
static void
{
/*
* Open up flow control. SCTP does not have any downcalls, and
* it will clr flow ctrl in sosctp_recvmsg().
*/
(so->so_proto_handle);
}
} else {
}
}
int
{
int more = 0;
int error;
#ifdef DEBUG
if (so_debug_length) {
}
#endif
/* Check whether the caller is OK to read past the mark */
if (flags & MSG_NOMARK) {
return (EWOULDBLOCK);
}
}
/*
* First move messages from the dump area to processing area
*/
if (sodp->sod_enabled) {
/* nothing to uioamove */
/*
* try to uioamove() the data that
* has already queued.
*/
}
} else {
}
}
/*
* We can release the lock as there can only be one reader
*/
if (new_msg_head != NULL) {
}
error = 0;
(so->so_rcv_timer_tid == 0 ||
if (error) {
return (error);
}
goto again;
}
} else {
}
}
/*
* First process PROTO or PCPROTO blocks, if any.
*/
savemptail = mp;
savemptail = mp;
}
}
/*
* Now process DATA blocks, if any. Note that for sodirect
* enabled socket, uio_resid can be 0.
*/
if (copied > 0)
/* mark this mblk as processed */
} else {
}
}
/*
* Can not read beyond the oobmark
*/
if (error != 0) {
more = 0;
goto done;
}
}
if (so->so_oobmark > 0) {
if (so->so_oobmark == 0) {
so->so_oobmark = 0;
}
}
/*
* so_check_flow_control() will drop
* so->so_lock.
*/
}
}
} else {
/*
* so_check_flow_control() will drop
* so->so_lock.
*/
}
/*
* Avoid queuing a zero-length tail part of
* a message. partial_read == 1 indicates that
* we read some of the message.
*/
} else {
(flags & MSG_DUPCTRL)) {
/*
* There should only be non data mblks
*/
BPRI_HI);
if (error != 0) {
/*
* In case we
* cannot copy
* control data
* free the remaining
* data.
*/
goto done;
}
goto try_again;
}
}
/*
* putback mp
*/
}
}
/* fast check so_rcv_head if there is more data */
goto again;
}
} else if (!partial_read) {
return (error);
}
/*
* No pending data. Return right away for nonblocking
* socket, otherwise sleep waiting for data.
*/
(flags & MSG_DONTWAIT)) {
error = EWOULDBLOCK;
} else {
goto done;
}
goto again1;
}
if (so->so_rcvtimeo == 0) {
/*
* Zero means disable timeout.
*/
} else {
}
so->so_rcv_wanted = 0;
if (error == 0) {
} else if (error == -1) {
} else {
goto again1;
}
}
}
}
/*
* We are past the mark, so update state.
* 4.3BSD and 4.4BSD clear the mark when peeking across it.
* The draft Posix socket spec states that the mark should
* not be cleared when peeking. We follow the latter.
*/
}
done:
if (sodp->sod_enabled &&
if (error == EWOULDBLOCK)
error = 0;
}
}
}
#ifdef DEBUG
if (so_debug_length) {
}
#endif
return (error);
}
/*
* Enqueue data from the protocol on the socket's rcv queue.
*
* We try to hook new M_DATA mblks onto an existing chain, however,
* that cannot be done if the existing chain has already been
* b_next. In all cases the b_prev of the enqueued mblk is set to
* point to the last mblk in its b_cont chain.
*/
void
{
#ifdef DEBUG
if (so_debug_length) {
}
#endif
/* Added to the end */
} else {
/* Start a new end */
}
#ifdef DEBUG
if (so_debug_length) {
}
#endif
}
/*
* Return B_TRUE if there is data in the message, B_FALSE otherwise.
*/
{
return (B_TRUE);
}
return (B_FALSE);
}
/*
* Flush the read side of sockfs.
*
* The caller must be sure that a reader is not already active when the
* buffer is being flushed.
*/
void
{
so->so_oobmark = 0;
}
/*
* Free messages sitting in the send and recv queue
*/
}
}
so->so_rcv_queued = 0;
}
/*
* Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
*/
int
{
int error;
flags));
/*
* There is never any oob data with addresses or control since
* the T_EXDATA_IND does not carry any options.
*/
msg->msg_controllen = 0;
msg->msg_namelen = 0;
}
if (oob_inline ||
return (EINVAL);
}
return (EWOULDBLOCK);
}
/*
* Since recv* can not return ENOBUFS we can not use dupmsg.
* Instead we revert to the consolidation private
* allocb_wait plus bcopy.
*/
}
} else {
/*
* Update the state indicating that the data has been consumed.
* Keep SS_OOBPEND set until data is consumed past the mark.
*/
}
error = 0;
if (n > 0)
if (error)
break;
}
return (error);
}
/*
* Allocate and initialize a sonode
*/
/* ARGSUSED */
struct sonode *
{
int kmflags;
/*
* Choose the right set of sonodeops based on the upcall and
* down call version that the protocol has provided
*/
/*
* mismatch
*/
#ifdef DEBUG
#endif
return (NULL);
}
return (NULL);
}
if (version == SOV_DEFAULT)
/*
* Set the default values to be INFPSZ; if a protocol desires, it can
* change the values later.
*/
return (so);
}
int
{
int error = 0;
/*
* We have a passive open, so inherit basic state from
* the parent (listener).
*
* No need to grab the new sonode's lock, since there is no
* one that can have a reference to it.
*/
/*
* Make note of the socket level options. TCP and IP level
* options are already inherited. We could do all this after
* accept is successful but doing it here simplifies code and
* no harm done for error case.
*/
} else {
/*
* Based on the version number select the right upcalls to
* pass down. Currently we only have one version so choose
* default
*/
/* active open, so create a lower handle */
/*
* To be safe; if a lower handle cannot be created, and
* the proto does not give a reason why, assume there
* was a lack of memory.
*/
}
}
/* Wildcard */
/*
* FIXME No need for this, the protocol can deal with it in
* sd_create(). Should update ICMP.
*/
int error;
/*
* Issue SO_PROTOTYPE setsockopt.
*/
if (error) {
/*
* Setsockopt often fails with ENOPROTOOPT but
* socket() should fail with EPROTONOSUPPORT.
*/
return (EPROTONOSUPPORT);
}
}
}
return (0);
}
/*
* int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
* struct cred *cr, int32_t *rvalp)
*
* Handle ioctls that manipulate basic socket state; non-blocking,
* async, etc.
*
* Returns:
* < 0 - ioctl was not handled
* >= 0 - ioctl was handled, if > 0, then it is an errno
*
* Notes:
* Assumes the standard receive buffer is used to obtain info for
* NREAD.
*/
/* ARGSUSED */
int
{
switch (cmd) {
case SIOCSQPTR:
/*
* SIOCSQPTR is valid only when helper stream is created
* by the protocol.
*/
return (EOPNOTSUPP);
case FIONBIO: {
return (EFAULT);
if (value) {
} else {
}
return (0);
}
case FIOASYNC: {
return (EFAULT);
if (value) {
/* Turn on SIGIO */
} else {
/* Turn off SIGIO */
}
return (0);
}
case SIOCSPGRP:
case FIOSETOWN: {
int error;
return (EFAULT);
return (error);
}
case SIOCGPGRP:
case FIOGETOWN:
return (EFAULT);
return (0);
case SIOCATMARK: {
int retval;
/*
* Only protocols that support urgent data can handle ATMARK.
*/
return (EINVAL);
/*
* If the protocol is maintaining its own buffer, then the
* request must be passed down.
*/
return (-1);
return (EFAULT);
}
return (0);
}
case FIONREAD: {
int retval;
/*
* If the protocol is maintaining its own buffer, then the
* request must be passed down.
*/
return (-1);
return (EFAULT);
}
return (0);
}
case _I_GETPEERCRED: {
int error = 0;
return (EINVAL);
} else {
}
return (error);
}
default:
return (-1);
}
}
/*
* Handle the I_NREAD STREAM ioctl.
*/
static int
{
int retval;
int count = 0;
return (EINVAL);
/* Wait for reader to get out of the way. */
/*
* If reader is waiting for data, then there should be nothing
* on the rcv queue.
*/
if (so->so_rcv_wakeup)
goto out;
/* Do a timed sleep, in case the reader goes to sleep. */
}
/*
* Since we are holding so_lock no new reader will come in, and the
* protocol will not be able to enqueue data. So it's safe to walk
* both rcv queues.
*/
count++;
} else {
/*
* In case the processing list was empty, get the size of the
* next msg in line.
*/
}
count++;
out:
/*
* Drop down from size_t to the "int" required by the
* interface. Cap at INT_MAX.
*/
return (EFAULT);
} else {
return (0);
}
}
/*
* Process STREAM ioctls.
*
* Returns:
* < 0 - ioctl was not handled
* >= 0 - ioctl was handled, if > 0, then it is an errno
*/
int
{
int retval;
/* Only STREAM ioctls are handled here */
return (-1);
switch (cmd) {
case I_CANPUT:
/*
* We return an error for I_CANPUT so that isastream(3C) will
* not report the socket as being a STREAM.
*/
return (EOPNOTSUPP);
case I_NREAD:
/* Avoid doing a fallback for I_NREAD. */
case I_LOOK:
/* Avoid doing a fallback for I_LOOK. */
return (EFAULT);
}
return (0);
default:
break;
}
/*
* Try to fall back to TPI, and if successful, reissue the ioctl.
*/
/* Reissue the ioctl */
} else {
return (retval);
}
}
/*
* This is called for all socket types to verify that the buffer size is large
* enough for the option, and if we can, handle the request as well. Most
* options will be forwarded to the protocol.
*/
int
{
if (level != SOL_SOCKET)
return (-1);
switch (option_name) {
case SO_ERROR:
case SO_DOMAIN:
case SO_TYPE:
case SO_ACCEPTCONN: {
return (EINVAL);
}
switch (option_name) {
case SO_ERROR:
break;
case SO_DOMAIN:
break;
case SO_TYPE:
break;
case SO_ACCEPTCONN:
else
value = 0;
break;
}
return (0);
}
case SO_SNDTIMEO:
case SO_RCVTIMEO: {
if (get_udatamodel() == DATAMODEL_NONE ||
get_udatamodel() == DATAMODEL_NATIVE) {
return (EINVAL);
} else {
return (EINVAL);
}
if (option_name == SO_RCVTIMEO)
else
if (get_udatamodel() == DATAMODEL_NONE ||
get_udatamodel() == DATAMODEL_NATIVE) {
} else {
}
return (0);
}
case SO_DEBUG:
case SO_REUSEADDR:
case SO_KEEPALIVE:
case SO_DONTROUTE:
case SO_BROADCAST:
case SO_USELOOPBACK:
case SO_OOBINLINE:
case SO_SNDBUF:
#ifdef notyet
case SO_SNDLOWAT:
case SO_RCVLOWAT:
#endif /* notyet */
case SO_DGRAM_ERRIND: {
return (EINVAL);
break;
}
case SO_RCVBUF: {
return (EINVAL);
/*
* XXX If SO_RCVBUF has been set and this is an
* XPG 4.2 application then do not ask the transport
* since the transport might adjust the value and not
* return exactly what was set by the application.
* For non-XPG 4.2 application we return the value
* that the transport is actually using.
*/
return (0);
}
/*
* If the option has not been set then get a default
* value from the transport.
*/
break;
}
case SO_LINGER: {
return (EINVAL);
break;
}
case SO_SND_BUFINFO: {
return (EINVAL);
*optlenp = sizeof (struct so_snd_bufinfo);
return (0);
}
default:
break;
}
/* Unknown Option */
return (-1);
}
void
{
}
int
{
int error = 0;
return (EINTR);
}
break;
}
}
return (error);
}
/*
* void so_timer_callback(void *arg)
*
* Timer callback. Resets so_rcv_timer_tid to 0 and then branches on
* whether so_rcv_queued is greater than zero.
*
* NOTE(review): `so` is not declared in the visible lines (presumably it
* is obtained from arg), and both branch bodies are empty here -- confirm
* against the complete source.
*/
void
so_timer_callback(void *arg)
{
so->so_rcv_timer_tid = 0;
if (so->so_rcv_queued > 0) {
} else {
}
}
#ifdef DEBUG
/*
* Verify that the length stored in so_rcv_queued and the length of data blocks
* queued are the same.
*/
static boolean_t
{
int len = 0;
}
}
}
#endif
int
{
}
/*
* so_start_fallback()
*
* Block new socket operations from coming in, and wait for active operations
* to complete. Threads that are sleeping will be woken up so they can get
* out of the way.
*
* The caller must be a reader on so_fallback_rwlock.
*/
static boolean_t
{
return (B_FALSE);
}
/*
* Poke all threads that might be sleeping. Any operation that comes
* in after the cv_broadcast will observe the fallback pending flag
* which causes the call to return where it would normally sleep.
*/
/*
* The main reason for the rw_tryupgrade call is to provide
* observability during the fallback process. We want to
* be able to see if there are pending operations.
*/
/*
* It is safe to drop and reacquire the fallback lock, because
* we are guaranteed that another fallback cannot take place.
*/
}
return (B_TRUE);
}
/*
* so_end_fallback()
*
* Allow socket operations back in.
*
* The caller must be a writer on so_fallback_rwlock.
*/
static void
{
}
/*
* so_quiesced_cb()
*
* Callback passed to the protocol during fallback. It is called once
* the endpoint is quiescent.
*
* No requests from the user, no notifications from the protocol, so it
* is safe to synchronize the state. Data can also be moved without
* risk of reordering.
*
* We do not need to hold so_lock, since there can be only one thread
* operating on the sonode.
*/
static void
{
/*
* Some protocols do not quiesce the data path during fallback. Once
* we set the SS_FALLBACK_DRAIN flag any attempt to queue data will
* fail and the protocol is responsible for saving the data for later
* delivery (i.e., once the fallback has completed).
*/
else
}
/*
* Clear any OOB state having to do with pending data. The TPI
* code path will set the appropriate oob state when we move the
* oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
* data has already been consumed.
*/
/*
* Move data to the STREAM head.
*/
/*
* Send T_EXDATA_IND if we are at the oob mark.
*/
if (atmark) {
struct T_exdata_ind *tei;
if (IS_SO_OOB_INLINE(so)) {
} else {
/* process current mp next time around */
mlen = 0;
}
/* we have consumed the oob mark */
} else if (so->so_oobmark > 0) {
/*
* Check if the OOB mark is within the current
* mblk chain. In that case we have to split it up.
*/
/*
* It is assumed that the OOB mark does
* not land within a mblk.
*/
do {
} while (so->so_oobmark > 0);
}
} else {
if (so->so_oobmark == 0)
}
}
/*
* Queue data on the STREAM head.
*/
}
/*
* Check if the oob byte is at the end of the data stream, or if the
* oob byte has not yet arrived. In the latter case we have to send a
* SIGURG and a mark indicator to the STREAM head. The mark indicator
* is needed to guarantee correct behavior for SIOCATMARK. See block
* comment in socktpi.h for more details.
*/
struct T_exdata_ind *tei;
} else {
/* Send up the signal */
/* Send up the mark indicator */
so->so_oobmark = 0;
}
}
}
}
}
#ifdef DEBUG
/*
* Do an integrity check of the sonode. This should be done if a
* fallback fails after the sonode has initially been converted to use
* TPI and subsequently has to be reverted.
*
* Failure to pass the integrity check will panic the system.
*/
void
{
/*
* For so_state we can only VERIFY the state flags in CHECK_STATE.
* The other state flags might be affected by a notification from the
* protocol.
*/
/* Cannot VERIFY so_proto_connid; proto can update it */
/* an error might have been recorded, but it can not be lost */
/* New conns might have arrived, but none should have been lost */
/* New OOB might have arrived, but mark should not have been lost */
/* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
/* New data might have arrived, but none should have been lost */
/* Cannot VERIFY so_proto_props; they can be updated by proto */
}
#endif
/*
* so_tpi_fallback()
*
* This is the fallback initiation routine; things start here.
*
* Basic strategy:
* o Block new socket operations from coming in
* o Quiesce the connection, at which point we sync
* state and move data
* o Change operations (sonodeops) associated with the socket
* o Unblock threads waiting for the fallback to finish
*/
int
{
int error;
queue_t *q;
struct sockparams *sp;
#ifdef DEBUG
#endif
error = 0;
/*
* Fallback can only happen if there is a device associated
* with the sonode, and the socket module has a fallback function.
*/
return (EINVAL);
/*
* Initiate fallback; upon success we know that no new requests
* will come in from the user.
*/
if (!so_start_fallback(so))
return (EAGAIN);
#ifdef DEBUG
/*
* Make a copy of the sonode in case we need to make an integrity
* check later on.
*/
#endif
if (error != 0)
goto out;
}
/* Turn sonode into a TPI socket */
if (error != 0)
goto out;
/*
* Now tell the protocol to start using TPI. so_quiesced_cb will be
* called once it's safe to synchronize state.
*/
if (error != 0) {
/* protocol was unable to do a fallback, revert the sonode */
goto out;
}
/*
* Walk the accept queue and notify the proto that they should
* fall back to TPI. The protocol will send up the T_CONN_IND.
*/
int rval;
if (rval != 0) {
"Failed to convert socket in accept queue to TPI. "
}
}
/*
* Now flush the acceptq, this will destroy all sockets. They will
* be recreated in sotpi_accept().
*/
/*
* Swap the sonode ops. Socket operations that come in once this
* is done will proceed without blocking.
*/
/*
* Wake up any threads stuck in poll. This is needed since the poll
* head changes when the fallback happens (moves from the sonode to
* the STREAMS head).
*/
out:
if (error != 0) {
#ifdef DEBUG
#endif
"Failed to convert socket to TPI (err=%d). Pid = %d\n",
}
return (error);
}