socksubr.c revision bd118333506194b55077122465f5051a4e3ac349
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/sysmacros.h>
#include <sys/pathname.h>
#include <sys/socketvar.h>
#define _SUN_TPI_VERSION 2
/*
* Macros that operate on struct cmsghdr.
* The CMSG_VALID macro does not assume that the last option buffer is padded.
*/
(ISALIGNED_cmsghdr(cmsg) && \
struct sockparams *sphead;
static int sockfs_update(kstat_t *, int);
static int sockfs_snapshot(kstat_t *, void *, int);
extern void sendfile_init();
extern void nl7c_init(void);
/*
* kernel structure for passing the sockinfo data back up to the user.
* the strings array allows us to convert AF_UNIX addresses into strings
* with a common method regardless of which n-bit kernel we're running.
*/
struct k_sockinfo {
};
/*
* Returns with the vnode held.
*/
static int
{
int error;
/*
* Lookup the underlying filesystem vnode.
*/
if (error)
return (error);
/* Check that it is the correct vnode */
return (ENOTSOCK);
}
/*
* If devpath went through devfs, the device should already
* be configured. If devpath is a mknod file, however, we
* need to make sure the device is properly configured.
* To do this, we do something similar to spec_open(), because
* we need to return a vnode.
*/
if (error == 0)
if (error != 0)
return (ENXIO);
}
/* device is configured at this point */
if (!STREAMSTAB(maj)) {
return (ENOSTR);
}
return (0);
}
/*
* Add or delete (latter if devpath is NULL) an entry to the sockparams
* table. If devpathlen is zero the devpath will not be kmem_freed. Otherwise
* this routine assumes that the caller has kmem_alloced devpath/devpathlen
* for this routine to consume.
* The zero devpathlen could be used if the kernel wants to create entries
*/
int
char *devpath, int devpathlen)
{
struct sockparams **spp;
struct sockparams *sp;
int error = 0;
dprint(0, ("soconfig(%d,%d,%d,%s,%d)\n",
/*
* Look for an existing match.
*/
break;
}
}
ASSERT(devpathlen == 0);
/* Delete existing entry */
goto done;
}
/* Unlink and free existing entry */
if (sp->sp_devpathlen != 0)
} else {
/* Add new entry */
goto done;
}
if (error) {
dprint(0, ("soconfig: vp %s failed with %d\n",
goto done;
}
dprint(0, ("soconfig: %s => vp %p, dev 0x%lx\n",
}
done:
if (error) {
#ifdef SOCK_DEBUG
#endif /* SOCK_DEBUG */
}
return (error);
}
/*
* Lookup an entry in the sockparams list based on the triple.
* If no entry is found and devpath is not NULL translate devpath to a
* vnode. Note that devpath is a pointer to a user address!
* Returns with the vnode held.
*
* When this routine uses devpath it does not create an entry in the sockparams
* list since this routine can run on behalf of any user and one user
* should not be able to affect the transport used by another user.
*
* In order to return the correct error this routine has to do wildcard scans
* of the list. The errors are (in decreasing precedence):
* EAFNOSUPPORT - address family not in list
* EPROTONOSUPPORT - address family supported but not protocol.
* EPROTOTYPE - address family and protocol supported but not socket type.
*/
vnode_t *
{
struct sockparams *sp;
int error;
break;
}
}
dprint(0, ("solookup(%d,%d,%d) not found\n",
/* Determine correct error code */
int found = 0;
found = 1;
found = 2;
}
switch (found) {
case 0:
*errorp = EAFNOSUPPORT;
break;
case 1:
break;
case 2:
*errorp = EPROTOTYPE;
break;
}
return (NULL);
}
/*
* Return vp based on devpath.
* Do not enter into table to avoid random users
* modifying the sockparams list.
*/
if (error) {
dprint(0, ("solookup: vp %p failed with %d\n",
return (NULL);
}
dprint(0, ("solookup: %p => vp %p, dev 0x%lx\n",
return (vp);
}
dprint(0, ("solookup(%d,%d,%d) vp %p devpath %s\n",
return (vp);
}
/*
* Return a socket vnode.
*
* Assumes that the caller is "passing" a VN_HOLD for accessvp i.e.
* when the socket is freed a VN_RELE will take place.
*
* Note that sockets assume that the driver will clone (either itself
* or by using the clone driver) i.e. a socket() call will always
* result in a new vnode being created.
*/
struct vnode *
{
now = gethrestime_sec();
/*
* Record in so_flag that it is a clone.
*/
}
so->so_pushcnt = 0;
so->so_options = 0;
so->so_sndlowat = 0;
so->so_rcvlowat = 0;
#ifdef notyet
so->so_sndtimeo = 0;
so->so_rcvtimeo = 0;
#endif /* notyet */
so->so_delayed_error = 0;
so->so_oobsigcnt = 0;
return (vp);
}
void
{
/*
* Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
* indirect them. It also uses so_accessvp as a validity test.
*/
if (so->so_laddr_sa) {
}
so->so_delayed_error = 0;
}
}
while (mp) {
}
}
#ifdef DEBUG
#endif /* DEBUG */
}
}
so->so_nl7c_rcv_rval = 0;
}
so->so_nl7c_flags = 0;
}
}
/*
* Update the accessed, updated, or changed times in an sonode
* with the current time.
*
* Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable
* attributes in a fstat call. (They return the current time and 0 for
* all timestamps, respectively.) We maintain the current timestamps
* here primarily so that should sockmod be popped the resulting
* file descriptor will behave like a stream w.r.t. the timestamps.
*/
void
{
}
/*ARGSUSED*/
static int
{
so->so_nl7c_flags = 0;
return (0);
}
/*ARGSUSED1*/
static void
{
}
static int
{
int retval;
}
return (retval);
}
static void
{
else
}
/*
* Init function called when sockfs is loaded.
*/
int
{
static const fs_operation_def_t sock_vfsops_template[] = {
};
int error;
char *err_str;
if (error != 0) {
return (error);
}
if (error != 0) {
err_str = "sockinit: bad sock vnode ops template";
/* vn_make_ops() does not reset socktpi_vnodeops on failure. */
goto failure;
}
if (error != 0) {
err_str = "sockinit: bad nca vnode ops template";
goto failure;
}
error = sosctp_init();
if (error != 0) {
goto failure;
}
/*
* Create sonode caches. We create a special one for AF_UNIX so
* that we can track them for netstat(1m).
*/
sizeof (struct sonode), 0, socktpi_constructor,
sizeof (struct sonode), 0, socktpi_unix_constructor,
/*
* Build initial list mapping socket parameters to vnode.
*/
/*
* it is possible to preload the sockparams list here using
* calls like:
*/
/*
* Create a unique dev_t for use in so_fsid.
*/
dev = 0;
sonca_init();
nl7c_init();
return (0);
(void) vfs_freevfsops_by_type(fstype);
if (socktpi_vnodeops != NULL)
if (socknca_vnodeops != NULL)
return (error);
}
/*
* Caller must hold the mutex. Used to set SOLOCKED.
*/
void
{
}
}
/*
* Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND.
* Used to clear SOLOCKED or SOASYNC_UNBIND.
*/
void
{
/*
* Process the T_DISCON_IND on so_discon_ind_mp.
*
* Call to so_drain_discon_ind will result in so_lock
* being dropped and re-acquired later.
*/
}
/*
* Caller must hold the mutex. Used to set SOREADLOCKED.
* If the caller wants nonblocking behavior it should set fmode.
*/
int
{
return (EWOULDBLOCK);
}
return (0);
}
/*
* Like so_lock_read above but allows signals.
*/
int
{
return (EWOULDBLOCK);
return (EINTR);
}
return (0);
}
/*
* Caller must hold the mutex. Used to clear SOREADLOCKED,
* set in so_lock_read() or so_lock_read_intr().
*/
void
{
}
/*
* Verify that the specified offset falls within the mblk and
* that the resulting pointer is aligned.
* Returns NULL if not.
*/
void *
{
eprintline(0);
return (NULL);
}
eprintline(0);
return (NULL);
}
return ((void *)ptr1);
}
/*
* Return the AF_UNIX underlying filesystem vnode matching a given name.
* Makes sure the sending and the destination sonodes are compatible.
* The vnode is returned held.
*
* The underlying filesystem VSOCK vnode has a v_stream pointer that
* references the actual stream head (hence indirectly the actual sonode).
*/
static int
{
int error;
if (error) {
return (error);
}
goto done2;
}
if (checkaccess) {
/*
* Check that we have permissions to access the destination
* vnode. This check is not done in BSD but it is required
*/
goto done2;
}
}
/*
* Check if the remote socket has been closed.
*
* Synchronize with vn_rele_stream by holding v_lock while traversing
* v_stream->sd_vnode.
*/
else
goto done2;
}
/*
* holding v_lock on underlying filesystem vnode and acquiring
* it on sockfs vnode. Assumes that no code ever attempts to
* acquire these locks in the reverse order.
*/
goto done;
}
error = EPROTOTYPE;
goto done;
}
return (0);
done:
return (error);
}
/*
* provider we have to do these ugly checks in the socket layer to
* preserve compatibility with SunOS 4.X.
*/
int
{
int family;
switch (family) {
case AF_INET:
return (EAFNOSUPPORT);
}
return (EINVAL);
}
break;
case AF_INET6: {
#ifdef DEBUG
struct sockaddr_in6 *sin6;
#endif /* DEBUG */
return (EAFNOSUPPORT);
}
return (EINVAL);
}
#ifdef DEBUG
/* Verify that apps don't forget to clear sin6_scope_id etc */
if (sin6->sin6_scope_id != 0 &&
"(%d) on socket. Pid = %d\n",
}
#endif /* DEBUG */
break;
}
case AF_UNIX:
return (0);
}
return (ENOENT);
}
return (EAFNOSUPPORT);
}
/* MAXPATHLEN + soun_family + nul termination */
return (ENAMETOOLONG);
}
break;
default:
/*
* Default is don't do any length or sa_family check
* to allow non-sockaddr style addresses.
*/
break;
}
return (0);
}
/*
* Translate an AF_UNIX sockaddr_un to the transport internal name.
* Assumes caller has called so_addr_verify first.
*/
/*ARGSUSED*/
int
{
int error;
struct sockaddr_un *soun;
void *addr;
/*
* Lookup vnode for the specified path name and verify that
* it is a socket.
*/
if (error) {
return (error);
}
/*
* Use the address of the peer vnode as the address to send
* to. We release the peer vnode here. In case it has been
* closed by the time the T_CONN_REQ or T_UNIDATA_REQ reaches the
* transport the message will get an error or be dropped.
*/
return (0);
}
/*
* Esballoc free function for messages that contain SO_FILEP option.
* Decrement the reference count on the file pointers using closef.
*/
void
{
int i;
/*
* We need pointer size alignment for fd_fds. On a LP64
* kernel, the required alignment is 8 bytes while
* the option headers and values are only 4 bytes
* aligned. So it's safer to do a bcopy compared to
* assigning fdbuf->fd_fds[i] to fp.
*/
}
}
/*
* Allocate an esballoc'ed message for AF_UNIX file descriptor passing.
* Waits if memory is not available.
*/
mblk_t *
{
return (mp);
}
/*
* Extract file descriptors from a fdbuf.
*/
/*ARGSUSED*/
static int
{
int i, fd;
int *rp;
int numfd;
/*
* Allocate a file descriptor and increment the f_count.
* The latter is needed since we always call fdbuf_free
* which performs a closef.
*/
for (i = 0; i < numfd; i++) {
goto cleanup;
/*
* We need pointer size alignment for fd_fds. On a LP64
* kernel, the required alignment is 8 bytes while
* the option headers and values are only 4 bytes
* aligned. So it's safer to do a bcopy compared to
* assigning fdbuf->fd_fds[i] to fp.
*/
#ifdef C2_AUDIT
if (audit_active)
#endif
}
return (0);
/*
* Undo whatever partial work the loop above has done.
*/
{
int j;
for (j = 0; j < i; j++) {
dprint(0,
("fdbuf_extract: cleanup[%d] = %d\n", j, *rp));
}
}
return (EMFILE);
}
/*
* Insert file descriptors into an fdbuf.
* Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed
* by calling fdbuf_free().
*/
int
{
int numfd, i;
int *fds;
int fdbufsize;
fdbuf->fd_ebuflen = 0;
for (i = 0; i < numfd; i++) {
return (EBADF);
}
/*
* The maximum alignment for fdbuf (or any option header
* and its value) is 4 bytes. On a LP64 kernel, the alignment
* is not sufficient for pointers (fd_fds in this case). Since
* we just did a kmem_alloc (we get a double word alignment),
* we don't need to do anything on the send side (we lose
* the double word alignment because fdbuf goes after an
* option header (eg T_unitdata_req) which is only 4 byte
* aligned). We take care of this when we extract the file
* descriptor in fdbuf_extract or fdbuf_free.
*/
#ifdef C2_AUDIT
if (audit_active)
#endif
}
return (0);
}
/*
 * Takes the length of a rights buffer (rightslen) and returns an int.
 * NOTE(review): only the declaration of numfd is visible in this view; the
 * computation and the return statement are not shown. Presumably numfd is
 * the number of descriptors derived from rightslen -- confirm against the
 * complete function before relying on this description.
 */
static int
fdbuf_optlen(int rightslen)
{
int numfd;
}
/*
 * Takes an fdbuf length (fdbuflen) and returns a t_uscalar_t.
 * NOTE(review): only the tail of the expression is visible in this view. It
 * involves sizeof (struct file *) and sizeof (int), which presumably scales
 * a count of file pointers to the size of the same number of ints -- confirm
 * against the complete statement.
 */
static t_uscalar_t
fdbuf_cmsglen(int fdbuflen)
{
(int)sizeof (struct file *) * (int)sizeof (int));
}
/*
* Return non-zero if the mblk and fdbuf are consistent.
*/
static int
{
if (fdbuflen >= FDBUF_HDRSIZE &&
/*
* Check that the SO_FILEP portion of the
* message has not been modified by
* the loopback transport. The sending sockfs generates
* a message that is esballoc'ed with the free function
* being fdbuf_free() and where free_arg contains the
* identical information as the SO_FILEP content.
*
* If any of these constraints are not satisfied we
* silently ignore the option.
*/
return (1);
} else {
"sockfs: mismatched fdbuf content (%p)",
(void *)mp);
return (0);
}
} else {
"sockfs: mismatched fdbuf len %d, %d\n",
return (0);
}
}
/*
* When the file descriptors returned by sorecvmsg can not be passed
* to the application this routine will cleanup the references on
* the files. Start at startoff bytes into the buffer.
*/
static void
{
int i;
for (i = 0; i < numfd; i++) {
if (startoff < 0)
startoff = 0;
if (startoff < (int)sizeof (int)) {
/*
* This file descriptor is partially or fully after
* the offset
*/
dprint(0,
("close_fds: cleanup[%d] = %d\n", i, fds[i]));
}
startoff -= (int)sizeof (int);
}
}
/*
* Close all file descriptors contained in the control part starting at
* the startoffset.
*/
void
int startoff)
{
return;
if (oldflg) {
return;
}
/* Scan control part for file descriptors. */
(int)CMSG_CONTENTLEN(cmsg),
}
}
}
/*
* in the control buffer. Returns with *fdlenp == -1 if there are no
* file descriptor options present. This is different than there being
* a zero-length file descriptor option.
* Fail if there are multiple SCM_RIGHTS cmsgs.
*/
int
{
void *fds;
int fdlen;
*fdlenp = -1;
return (0);
}
if (oldflg) {
if (controllen == 0)
*fdlenp = -1;
else
*fdlenp = controllen;
return (0);
}
fdlen = 0;
return (EINVAL);
}
}
*fdlenp = -1;
} else
return (0);
}
/*
* Return the length of the options including any file descriptor options.
*/
{
t_uscalar_t optlen = 0;
return (0);
if (oldflg)
return ((t_uscalar_t)(sizeof (struct T_opthdr) +
} else {
}
sizeof (struct T_opthdr));
}
return (optlen);
}
/*
* Copy options from control to the mblk. Skip any file descriptor options.
*/
void
{
return;
if (oldflg) {
/* No real options - caller has handled file descriptors */
return;
}
/*
* Note: The caller handles file descriptors prior
* to calling this function.
*/
continue;
}
}
/*
* Return the length of the control message derived from the options.
* Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP.
* When oldflg is set only include SO_FILEP.
*/
{
t_uscalar_t cmsglen = 0;
t_uscalar_t last_roundup = 0;
continue;
}
int fdbuflen;
continue;
if (oldflg) {
continue;
}
} else {
if (oldflg)
continue;
}
/*
* Exclude roundup for last option to not set
* MSG_CTRUNC when the cmsg fits but the padding doesn't fit.
*/
}
cmsglen -= last_roundup;
return (cmsglen);
}
/*
* Copy options from options to the control. Convert SO_FILEP to
* file descriptors.
* Returns errno or zero.
*/
int
{
int fdbuflen;
int error;
continue;
}
return (EPROTO);
if (oldflg) {
(int)controllen);
if (error != 0)
return (error);
continue;
} else {
int fdlen;
fdlen = (int)fdbuf_cmsglen(
(int)_TPI_TOPT_DATALEN(tohp));
sizeof (struct cmsghdr));
if (error != 0)
return (error);
}
} else {
if (oldflg)
continue;
sizeof (struct cmsghdr));
/* copy content to control data part */
}
/* move to next CMSG structure! */
}
return (0);
}
/*
* Extract the SO_SRCADDR option value if present.
*/
void
{
*srclenp = 0;
}
}
}
/*
* Verify if the SO_UNIX_CLOSE option is present.
*/
int
{
dprint(1,
("so_getopt_unix_close: level 0x%x, name %d, len %d\n",
return (1);
}
return (0);
}
/*
* Allocate an M_PROTO message.
*
* If allocation fails the behavior depends on sleepflg:
* _ALLOC_NOSLEEP fail immediately
* _ALLOC_INTR sleep for memory until a signal is caught
* _ALLOC_SLEEP sleep forever. Don't return NULL.
*/
mblk_t *
{
/* Round up size for reuse */
int error; /* Dummy - error not returned to caller */
switch (sleepflg) {
case _ALLOC_SLEEP:
break;
case _ALLOC_INTR:
/* Caught signal while sleeping for memory */
return (NULL);
}
break;
case _ALLOC_NOSLEEP:
default:
return (NULL);
}
}
return (mp);
}
/*
* Allocate an M_PROTO message with a single component.
* len is the length of buf. size is the amount to allocate.
*
* buf can be NULL with a non-zero len.
* This results in a bzero'ed chunk being placed in the message.
*/
mblk_t *
{
if (size == 0)
/* Round up size for reuse */
return (NULL);
if (len != 0) {
else
}
return (mp);
}
/*
* The caller has to ensure that there is enough room in the mblk.
*
* buf can be NULL with a non-zero len.
* This results in a bzero'ed chunk being placed in the message.
*/
void
{
if (len != 0) {
/* Assert for room left */
else
}
}
/*
* Create a message using two kernel buffers.
* If size is set that will determine the allocation size (e.g. for future
* soappendmsg calls). If size is zero it is derived from the buffer
* lengths.
*/
mblk_t *
{
if (size == 0)
if (mp)
return (mp);
}
/*
* Create a message using three kernel buffers.
* If size is set that will determine the allocation size (for future
* soappendmsg calls). If size is zero it is derived from the buffer
* lengths.
*/
mblk_t *
{
if (size == 0)
}
return (mp);
}
#ifdef DEBUG
char *
{
static char buf[1024];
buf[0] = 0;
if (state & SS_ISCONNECTED)
if (state & SS_ISCONNECTING)
if (state & SS_ISDISCONNECTING)
if (state & SS_CANTSENDMORE)
if (state & SS_CANTRCVMORE)
if (state & SS_ISBOUND)
if (state & SS_NONBLOCK)
if (state & SS_ACCEPTCONN)
if (state & SS_HASCONNIND)
if (state & SS_SAVEDEOR)
if (state & SS_RCVATMARK)
if (state & SS_OOBPEND)
if (state & SS_HAVEOOBDATA)
if (state & SS_HADOOBDATA)
if (state & SS_FADDR_NOXLATE)
if (mode & SM_CONNREQUIRED)
if (mode & SM_FDPASSING)
if (mode & SM_OPTDATA)
if (mode & SM_BYTESTREAM)
return (buf);
}
char *
{
static char buf[1024];
return (buf);
}
switch (family) {
case AF_INET: {
struct sockaddr_in sin;
break;
}
case AF_INET6: {
struct sockaddr_in6 sin6;
break;
}
case AF_UNIX: {
break;
}
default:
break;
}
return (buf);
}
/*
 * The logical equivalence operator (a if-and-only-if b).
 * Evaluates to 1 when a and b have the same truth value and to 0 otherwise.
 * Both operands are normalized with '!' so that any non-zero value counts
 * as true. Each argument is evaluated exactly once, so passing an
 * expression with side effects is safe.
 */
#define EQUIV(a, b) (!(a) == !(b))
/*
* Verify limitations and invariants on oob state.
* Return 1 if OK, otherwise 0 so that it can be used as
* ASSERT(verify_oobstate(so));
*/
int
{
/*
* The possible state combinations are:
* 0
* SS_OOBPEND
* SS_OOBPEND|SS_HAVEOOBDATA
* SS_OOBPEND|SS_HADOOBDATA
* SS_HADOOBDATA
*/
case 0:
case SS_OOBPEND:
case SS_OOBPEND|SS_HAVEOOBDATA:
case SS_OOBPEND|SS_HADOOBDATA:
case SS_HADOOBDATA:
break;
default:
printf("Bad oob state 1 (%p): counts %d/%d state %s\n",
return (0);
}
/* SS_RCVATMARK should only be set when SS_OOBPEND is set */
printf("Bad oob state 2 (%p): counts %d/%d state %s\n",
return (0);
}
/*
* (so_oobsigcnt != 0 or SS_RCVATMARK) iff SS_OOBPEND
*/
printf("Bad oob state 3 (%p): counts %d/%d state %s\n",
return (0);
}
/*
* Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA
*/
printf("Bad oob state 4 (%p): counts %d/%d state %s\n",
return (0);
}
printf("Bad oob state 5 (%p): counts %d/%d state %s\n",
return (0);
}
return (1);
}
#endif /* DEBUG */
/* initialize sockfs zone specific kstat related items */
void *
{
}
return (ksp);
}
/* tear down sockfs zone specific kstat related items */
/*ARGSUSED*/
void
{
}
}
/*
* Zones:
* Note that nactive is going to be different for each zone.
* This means we require kstat to call sockfs_update and then sockfs_snapshot
* for the same zone, or sockfs_snapshot will be taken into the wrong size
* buffer. This is safe, but if the buffer is too small, user will not be
* given details of all sockets. However, as this kstat has a ks_lock, kstat
* driver will keep it locked between the update and the snapshot, so no
* other process (zone) can currently get in between resulting in a wrong size
* buffer allocation.
*/
static int
{
return (EACCES);
}
nactive++;
}
}
return (0);
}
static int
{
int ns; /* # of sonodes we've copied */
return (EACCES);
}
/*
* for each sonode on the socklist, we massage the important
* info into buf, in k_sockinfo format.
*/
/* only stuff active sonodes and the same zone: */
continue;
}
/*
* If the sonode was activated between the update and the
* snapshot, we're done - as this is only a snapshot.
*/
break;
}
/* copy important info into buf: */
if (sn_len != 0) {
/* AF_UNIX socket names are NULL terminated */
}
}
if (sn_len != 0) {
}
}
ns++;
pksi++;
}
return (0);
}
{
int error = 0;
int iovcnt = 0;
short fflag;
rwflag = 0;
iovcnt = 1;
/* If read sync is not asked for, filter sync flags */
error = 0;
out:
if (error != 0) {
return (0);
} else {
*err = 0;
return (cnt);
}
}