lx_brand/common/socket.c

	socket.c revision 9acbbeaf2a1ffe5c14b244867d427714fab43c5c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <libintl.h>
#include <strings.h>
#include <alloca.h>

#include <sys/param.h>
#include <sys/brand.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/un.h>
#include <netinet/tcp.h>
#include <netinet/igmp.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/lx_debug.h>
#include <sys/lx_syscall.h>
#include <sys/lx_socket.h>
#include <sys/lx_brand.h>
#include <sys/lx_misc.h>

/*
 * Handlers for the individual socketcall subcommands.  Each one is handed
 * a pointer to the argument array that lx_socketcall() copied in from the
 * caller, and returns the result of the call or a negated errno value.
 */
static int lx_socket(ulong_t *);
static int lx_bind(ulong_t *);
static int lx_connect(ulong_t *);
static int lx_listen(ulong_t *);
static int lx_accept(ulong_t *);
static int lx_getsockname(ulong_t *);
static int lx_getpeername(ulong_t *);
static int lx_socketpair(ulong_t *);
static int lx_send(ulong_t *);
static int lx_recv(ulong_t *);
static int lx_sendto(ulong_t *);
static int lx_recvfrom(ulong_t *);
static int lx_shutdown(ulong_t *);
static int lx_setsockopt(ulong_t *);
static int lx_getsockopt(ulong_t *);
static int lx_sendmsg(ulong_t *);
static int lx_recvmsg(ulong_t *);

/* Signature shared by every socketcall subcommand handler. */
typedef int (*sockfn_t)(ulong_t *);

/*
 * Dispatch table used by lx_socketcall().  It is indexed by the subcommand
 * number minus one; s_nargs is the count of ulong_t arguments that must be
 * copied in from the caller before s_fn is invoked.
 */
static struct {
	sockfn_t s_fn;	/* handler for this subcommand */
	int s_nargs;	/* how many arguments the handler consumes */
} sockfns[] = {
	{ lx_socket,		3 },
	{ lx_bind,		3 },
	{ lx_connect,		3 },
	{ lx_listen,		2 },
	{ lx_accept,		3 },
	{ lx_getsockname,	3 },
	{ lx_getpeername,	3 },
	{ lx_socketpair,	4 },
	{ lx_send,		4 },
	{ lx_recv,		4 },
	{ lx_sendto,		6 },
	{ lx_recvfrom,		6 },
	{ lx_shutdown,		2 },
	{ lx_setsockopt,	5 },
	{ lx_getsockopt,	5 },
	{ lx_sendmsg,		3 },
	{ lx_recvmsg,		3 }
};

/*
 * What follows are a series of tables we use to translate Linux constants
 * into equivalent Solaris constants and back again.  I wish this were
 * cleaner, more programmatic, and generally nicer.  Sadly, life is messy,
 * and Unix networking even more so.
 */
/*
 * Linux address family -> Solaris address family.  The table is indexed by
 * the Linux family number (0 through LX_AF_MAX); slots that hold
 * AF_NOTSUPPORTED are families this table has no Solaris equivalent for.
 */
static const int ltos_family[LX_AF_MAX + 1] =  {
	/*  0 -  4 */
	AF_UNSPEC, AF_UNIX, AF_INET, AF_CCITT, AF_IPX,
	/*  5 -  8 */
	AF_APPLETALK, AF_NOTSUPPORTED, AF_OSI, AF_NOTSUPPORTED,
	/*  9 - 12 */
	AF_X25, AF_INET6, AF_CCITT, AF_DECnet,
	/* 13 - 16 */
	AF_802, AF_POLICY, AF_KEY, AF_ROUTE,
	/* 17 - 20 */
	AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED,
	/* 21 - 24 */
	AF_NOTSUPPORTED, AF_SNA, AF_NOTSUPPORTED, AF_NOTSUPPORTED,
	/* 25 - 28 */
	AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED,
	/* 29 - 32 */
	AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED
};

/*
 * Bounds-checked lookup in ltos_family: anything above LX_AF_MAX yields
 * AF_INVAL.  Only the upper bound is checked, so callers must not pass a
 * negative value.  The argument is evaluated more than once.
 */
#define	LTOS_FAMILY(d) ((d) <= LX_AF_MAX ? ltos_family[(d)] : AF_INVAL)

/*
 * Linux socket type -> Solaris socket type, indexed by the Linux type
 * number (0 through LX_SOCK_PACKET).  SOCK_NOTSUPPORTED marks the slots
 * that have no Solaris mapping in this table.
 */
static const int ltos_socktype[LX_SOCK_PACKET + 1] = {
	/*  0 -  3 */
	SOCK_NOTSUPPORTED, SOCK_STREAM, SOCK_DGRAM, SOCK_RAW,
	/*  4 -  7 */
	SOCK_RDM, SOCK_SEQPACKET, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED,
	/*  8 - 10 */
	SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED
};

/*
 * Bounds-checked lookup in ltos_socktype: anything above LX_SOCK_PACKET
 * yields SOCK_INVAL.  Only the upper bound is checked, so callers must not
 * pass a negative value.  The argument is evaluated more than once.
 */
#define	LTOS_SOCKTYPE(t)	\
	((t) <= LX_SOCK_PACKET ? ltos_socktype[(t)] : SOCK_INVAL)

/*
 * Linux socket option type definitions
 *
 * The protocol `levels` are well defined (see in.h).  The option values are
 * not so well defined. Linux often uses different values to Solaris
 * although they mean the same thing. For example, IP_TOS in Linux is
 * defined as value 1 but in Solaris it is defined as value 3. This table
 * maps all the Protocol levels to their options and maps them between
 * Linux and Solaris and vice versa.  Hence the reason for the complexity.
 */

/*
 * Describes the option translation table for one protocol level.  A table
 * with maxentries == 0 (and a NULL proto pointer) means the level is not
 * implemented.
 */
typedef struct lx_proto_opts {
	const int *proto;	/* Linux to Solaris mapping table */
	int maxentries;		/* max entries in this table */
} lx_proto_opts_t;

/* Table value marking a Linux option that has no translation. */
#define	OPTNOTSUP	-1	/* we don't support it */

/*
 * IP-level option translation, indexed by the Linux option number
 * (0 through LX_IP_DROP_MEMBERSHIP).  Each slot holds the Solaris option
 * to use, or OPTNOTSUP.
 */
static const int ltos_ip_sockopts[LX_IP_DROP_MEMBERSHIP + 1] = {
	/*  0 -  3 */
	OPTNOTSUP, IP_TOS, IP_TTL, IP_HDRINCL,
	/*  4 -  7 */
	IP_OPTIONS, OPTNOTSUP, IP_RECVOPTS, IP_RETOPTS,
	/*  8 - 11 */
	OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/* 12 - 15 */
	IP_RECVTTL, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/* 16 - 19 */
	OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/* 20 - 23 */
	OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/* 24 - 27 */
	OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/* 28 - 31 */
	OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/* 32 - 34 */
	IP_MULTICAST_IF, IP_MULTICAST_TTL, IP_MULTICAST_LOOP,
	/* 35 - 36 */
	IP_ADD_MEMBERSHIP, IP_DROP_MEMBERSHIP
};

/*
 * TCP-level option translation, indexed by the Linux option number
 * (0 through LX_TCP_QUICKACK).  Each slot holds the Solaris option to use,
 * or OPTNOTSUP.
 */
static const int ltos_tcp_sockopts[LX_TCP_QUICKACK + 1] = {
	/*  0 -  3 */
	OPTNOTSUP, TCP_NODELAY, TCP_MAXSEG, OPTNOTSUP,
	/*  4 -  7 */
	OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/*  8 - 11 */
	TCP_KEEPALIVE, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/* 12 */
	OPTNOTSUP
};

/*
 * IGMP-level option translation, indexed by the Linux option number
 * (0 through IGMP_MTRACE).  Each slot holds the Solaris value to use, or
 * OPTNOTSUP.
 */
static const int ltos_igmp_sockopts[IGMP_MTRACE + 1] = {
	/*  0 -  3 */
	OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/*  4 -  7 */
	OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/*  8 - 10 */
	IGMP_MINLEN, OPTNOTSUP, OPTNOTSUP, /* XXX: was IGMP_TIMER_SCALE */
	/* 11 - 14 */
	OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/* 15 - 17 */
	OPTNOTSUP, OPTNOTSUP, IGMP_MEMBERSHIP_QUERY,
	/* 18 - 19 */
	IGMP_V1_MEMBERSHIP_REPORT, IGMP_DVMRP,
	/* 20 - 22 */
	IGMP_PIM, OPTNOTSUP, IGMP_V2_MEMBERSHIP_REPORT,
	/* 23 - 25 */
	IGMP_V2_LEAVE_GROUP, OPTNOTSUP, OPTNOTSUP,
	/* 26 - 29 */
	OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
	/* 30 - 31 */
	IGMP_MTRACE_RESP, IGMP_MTRACE
};

/*
 * Socket-level option translation, indexed by the Linux option number
 * (0 through LX_SO_ACCEPTCONN).  Each slot holds the Solaris option to
 * use, or OPTNOTSUP.
 */
static const int ltos_socket_sockopts[LX_SO_ACCEPTCONN + 1] = {
	/*  0 -  3 */
	OPTNOTSUP,	SO_DEBUG,	SO_REUSEADDR,   SO_TYPE,
	/*  4 -  7 */
	SO_ERROR,	SO_DONTROUTE,   SO_BROADCAST,   SO_SNDBUF,
	/*  8 - 11 */
	SO_RCVBUF,	SO_KEEPALIVE,   SO_OOBINLINE,   OPTNOTSUP,
	/* 12 - 15 */
	OPTNOTSUP,	SO_LINGER,	OPTNOTSUP,	OPTNOTSUP,
	/* 16 - 19 */
	OPTNOTSUP,	OPTNOTSUP,	SO_RCVLOWAT,    SO_SNDLOWAT,
	/* 20 - 23 */
	SO_RCVTIMEO,    SO_SNDTIMEO,    OPTNOTSUP,	OPTNOTSUP,
	/* 24 - 27 */
	OPTNOTSUP,	OPTNOTSUP,	OPTNOTSUP,	OPTNOTSUP,
	/* 28 - 30 */
	OPTNOTSUP,	OPTNOTSUP,	SO_ACCEPTCONN
};

/*
 * Build an lx_proto_opts_t initializer from an option table: the table
 * itself plus its element count.  The argument must be a true array (not a
 * pointer) for the sizeof arithmetic to give the element count.
 */
#define	PROTO_SOCKOPTS(opts)    \
	{ (opts), sizeof ((opts)) / sizeof ((opts)[0]) }

/*
 * The main Linux to Solaris protocol to options mapping table.
 * IPPROTO_TAB_SIZE can be set up to IPPROTO_MAX.  All entries at or above
 * IPPROTO_TAB_SIZE are in effect not implemented.
 *
 * The table is indexed by the Linux protocol level passed to
 * setsockopt()/getsockopt(); levels whose entry is { NULL, 0 } are rejected
 * by those routines.
 */

#define	IPPROTO_TAB_SIZE	8

static const lx_proto_opts_t ltos_proto_opts[IPPROTO_TAB_SIZE] = {
	/* IPPROTO_IP		0 */
	PROTO_SOCKOPTS(ltos_ip_sockopts),
	/* SOL_SOCKET		1 */
	PROTO_SOCKOPTS(ltos_socket_sockopts),
	/* IPPROTO_IGMP		2 */
	PROTO_SOCKOPTS(ltos_igmp_sockopts),
	/* NOT IMPLEMENTED	3 */
	{ NULL, 0 },
	/* NOT IMPLEMENTED	4 */
	{ NULL, 0 },
	/* NOT IMPLEMENTED	5 */
	{ NULL, 0 },
	/* IPPROTO_TCP		6 */
	PROTO_SOCKOPTS(ltos_tcp_sockopts),
	/* NOT IMPLEMENTED	7 */
	{ NULL, 0 }
};

/*
 * Lifted from socket.h, since these definitions are contained within
 * _KERNEL guards.
 */
/* Control message headers are aligned on 4-byte boundaries. */
#define	_CMSG_HDR_ALIGNMENT	4
/* Round x up to the next multiple of _CMSG_HDR_ALIGNMENT. */
#define	_CMSG_HDR_ALIGN(x)	(((uintptr_t)(x) + _CMSG_HDR_ALIGNMENT - 1) & \
				    ~(_CMSG_HDR_ALIGNMENT - 1))
/*
 * First control message header in m's control buffer, or a null pointer if
 * msg_controllen is too small to hold even one header.
 */
#define	CMSG_FIRSTHDR(m)						\
	(((m)->msg_controllen < sizeof (struct cmsghdr)) ?		\
	    (struct cmsghdr *)0 : (struct cmsghdr *)((m)->msg_control))

/*
 * Control message header following c in m's control buffer.  A null c
 * yields the first header.  Otherwise the next header is taken to start at
 * the aligned address c + c->cmsg_len; if a full header starting there
 * would run past msg_control + msg_controllen, a null pointer is returned.
 * Unlike CMSG_FIRSTHDR, m is cast to struct lx_msghdr * here.  Both
 * arguments are evaluated more than once.
 */
#define	CMSG_NXTHDR(m, c)						\
	(((c) == 0) ? CMSG_FIRSTHDR(m) :			\
	((((uintptr_t)_CMSG_HDR_ALIGN((char *)(c) +			\
	((struct cmsghdr *)(c))->cmsg_len) + sizeof (struct cmsghdr)) >	\
	(((uintptr_t)((struct lx_msghdr *)(m))->msg_control) +		\
	((uintptr_t)((struct lx_msghdr *)(m))->msg_controllen))) ?	\
	((struct cmsghdr *)0) :						\
	((struct cmsghdr *)_CMSG_HDR_ALIGN((char *)(c) +		\
	    ((struct cmsghdr *)(c))->cmsg_len))))

/* Conversion directions accepted by convert_cmsgs(). */
#define	LX_TO_SOL	1
#define	SOL_TO_LX	2

/*
 * Rewrite, in place, the level and type of every control message attached
 * to msg so that they use the other system's numbering.  direction selects
 * Linux -> Solaris (LX_TO_SOL); any other value converts Solaris -> Linux.
 *
 * Only socket-level SCM_RIGHTS and credential messages are understood.
 * The walk stops at the first message that cannot be converted, the
 * problem is reported through lx_unsupported() using caller as the name of
 * the operation, and ENOTSUP is returned.  Messages preceding the failing
 * one have already been rewritten at that point.  Returns 0 on success.
 */
static int
convert_cmsgs(int direction, struct lx_msghdr *msg, char *caller)
{
	struct cmsghdr *cur;
	int from_level, to_level;
	int from_rights, to_rights;
	int from_cred, to_cred;
	int rv = 0;

	if (direction == LX_TO_SOL) {
		from_level = LX_SOL_SOCKET;
		to_level = SOL_SOCKET;
		from_rights = LX_SCM_RIGHTS;
		to_rights = SCM_RIGHTS;
		from_cred = LX_SCM_CRED;
		to_cred = SCM_UCRED;
	} else {
		from_level = SOL_SOCKET;
		to_level = LX_SOL_SOCKET;
		from_rights = SCM_RIGHTS;
		to_rights = LX_SCM_RIGHTS;
		from_cred = SCM_UCRED;
		to_cred = LX_SCM_CRED;
	}

	for (cur = CMSG_FIRSTHDR(msg); cur != NULL && rv == 0;
	    cur = CMSG_NXTHDR(msg, cur)) {
		/* Anything that is not a socket-level message is rejected. */
		if (cur->cmsg_level != from_level) {
			rv = ENOTSUP;
			continue;
		}

		cur->cmsg_level = to_level;

		if (cur->cmsg_type == from_rights)
			cur->cmsg_type = to_rights;
		else if (cur->cmsg_type == from_cred)
			cur->cmsg_type = to_cred;
		else
			rv = ENOTSUP;
	}

	if (rv != 0)
		lx_unsupported("Unsupported socket control message in %s\n.",
		    caller);

	return (rv);
}

/*
 * Copy a Linux socket address in from the caller and turn it into the
 * Solaris form.
 *
 *	addr	destination buffer in our address space
 *	len	set to the length of the converted address
 *	inaddr	address supplied by the Linux application
 *	inlen	length of the address at inaddr
 *
 * The buffer at addr must be able to hold inlen bytes and, because an
 * AF_INET6 address grows by one 32-bit field during conversion, at least
 * sizeof (struct sockaddr_in6) bytes whenever the caller may be handed an
 * IPv6 address.
 *
 * Returns 0 on success or a negated errno value.
 */
static int
convert_sockaddr(struct sockaddr *addr, socklen_t *len,
	struct sockaddr *inaddr, socklen_t inlen)
{
	sa_family_t family;
	socklen_t lx_in6_len;

	/*
	 * Note that if the buffer at inaddr is ever smaller than inlen bytes,
	 * we may erroneously return EFAULT rather than a possible EINVAL
	 * as the copy comes before the various checks as to whether inlen
	 * is of the proper length for the socket type.
	 *
	 * This isn't an issue at present because all callers to this routine
	 * do meet that constraint.
	 */
	if ((ssize_t)inlen < 0)
		return (-EINVAL);
	if (uucopy(inaddr, addr, inlen) != 0)
		return (-errno);

	/*
	 * Without a complete family field there is nothing to convert, and
	 * looking at addr->sa_family would read bytes that were never
	 * copied in.
	 */
	if (inlen < sizeof (addr->sa_family))
		return (-EINVAL);

	family = LTOS_FAMILY(addr->sa_family);

	switch (family) {
		case (sa_family_t)AF_NOTSUPPORTED:
			return (-EPROTONOSUPPORT);
		case (sa_family_t)AF_INVAL:
			return (-EAFNOSUPPORT);
		case AF_INET:
			if (inlen < sizeof (struct sockaddr))
				return (-EINVAL);

			*len = sizeof (struct sockaddr);
			break;

		case AF_INET6:
			/*
			 * The Solaris sockaddr_in6 has one more 32-bit
			 * field than the Linux version, so the Linux
			 * address must be exactly that much shorter.  The
			 * extra field is zeroed.
			 */
			lx_in6_len = sizeof (struct sockaddr_in6) -
			    sizeof (uint32_t);

			if (inlen != lx_in6_len)
				return (-EINVAL);

			*len = sizeof (struct sockaddr_in6);
			bzero((char *)addr + lx_in6_len, sizeof (uint32_t));
			break;

		case AF_UNIX:
			if (inlen > sizeof (struct sockaddr_un))
				return (-EINVAL);

			*len = inlen;
			break;

		default:
			*len = inlen;
	}

	addr->sa_family = family;
	return (0);
}

/*
 * Validate the domain/type/protocol triple given to socket() or
 * socketpair() and translate the domain and type to Solaris values.
 *
 * On success 0 is returned and *out_dom / *out_type are filled in; on
 * failure a negated errno value is returned and the outputs are untouched.
 */
static int
convert_sock_args(int in_dom, int in_type, int in_protocol, int *out_dom,
    int *out_type)
{
	int sol_dom, sol_type;

	/* None of the three values may be negative. */
	if (!(in_dom >= 0 && in_type >= 0 && in_protocol >= 0))
		return (-EINVAL);

	sol_dom = LTOS_FAMILY(in_dom);
	if (sol_dom == AF_NOTSUPPORTED)
		return (-EAFNOSUPPORT);
	if (sol_dom == AF_UNSPEC)
		return (-EAFNOSUPPORT);
	if (sol_dom == AF_INVAL)
		return (-EINVAL);

	sol_type = LTOS_SOCKTYPE(in_type);
	if (sol_type == SOCK_NOTSUPPORTED)
		return (-ESOCKTNOSUPPORT);
	if (sol_type == SOCK_INVAL)
		return (-EINVAL);

	/*
	 * Linux does not allow the app to specify IP Protocol for raw
	 * sockets.  Solaris does, so bail out here.
	 */
	if (sol_type == SOCK_RAW && in_protocol == IPPROTO_IP)
		return (-ESOCKTNOSUPPORT);

	*out_dom = sol_dom;
	*out_type = sol_type;

	return (0);
}

/*
 * Translate Linux MSG_* flags into their Solaris equivalents.
 *
 * Flags that have a Solaris counterpart are carried over.  A handful of
 * Linux-only flags are reported through lx_unsupported() and otherwise
 * dropped.  Any remaining bits (LX_MSG_NOSIGNAL among them) are silently
 * left out of the result.
 */
static int
convert_sockflags(int lx_flags)
{
	static const struct {
		int lx_bit;	/* Linux flag */
		int sol_bit;	/* matching Solaris flag */
	} xlate[] = {
		{ LX_MSG_OOB,		MSG_OOB },
		{ LX_MSG_PEEK,		MSG_PEEK },
		{ LX_MSG_DONTROUTE,	MSG_DONTROUTE },
		{ LX_MSG_CTRUNC,	MSG_CTRUNC },
		{ LX_MSG_TRUNC,		MSG_TRUNC },
		{ LX_MSG_WAITALL,	MSG_WAITALL },
		{ LX_MSG_DONTWAIT,	MSG_DONTWAIT },
		{ LX_MSG_EOR,		MSG_EOR }
	};
	int result = 0;
	size_t i;

	for (i = 0; i < sizeof (xlate) / sizeof (xlate[0]); i++) {
		if ((lx_flags & xlate[i].lx_bit) != 0)
			result |= xlate[i].sol_bit;
	}

	/* Flags we cannot honor; complain about each one that is set. */
	if ((lx_flags & LX_MSG_PROXY) != 0)
		lx_unsupported("socket operation with MSG_PROXY flag set");
	if ((lx_flags & LX_MSG_FIN) != 0)
		lx_unsupported("socket operation with MSG_FIN flag set");
	if ((lx_flags & LX_MSG_SYN) != 0)
		lx_unsupported("socket operation with MSG_SYN flag set");
	if ((lx_flags & LX_MSG_CONFIRM) != 0)
		lx_unsupported("socket operation with MSG_CONFIRM set");
	if ((lx_flags & LX_MSG_RST) != 0)
		lx_unsupported("socket operation with MSG_RST flag set");
	if ((lx_flags & LX_MSG_MORE) != 0)
		lx_unsupported("socket operation with MSG_MORE flag set");

	return (result);
}

/*
 * socket(2) emulation: args[0] is the Linux domain, args[1] the Linux
 * socket type and args[2] the protocol.  Returns the new descriptor or a
 * negated errno value.
 */
static int
lx_socket(ulong_t *args)
{
	int protocol = (int)args[2];
	int sol_domain, sol_type;
	int sock;
	int rv;

	rv = convert_sock_args((int)args[0], (int)args[1], protocol,
	    &sol_domain, &sol_type);
	if (rv != 0)
		return (rv);

	lx_debug("\tsocket(%d, %d, %d)", sol_domain, sol_type, protocol);

	if ((sock = socket(sol_domain, sol_type, protocol)) < 0) {
		/* An unsupported protocol is reported as a bad socket type. */
		if (errno == EPROTONOSUPPORT)
			return (-ESOCKTNOSUPPORT);
		return (-errno);
	}

	return (sock);
}

/*
 * bind(2) emulation: args[0] is the socket, args[1] the Linux socket
 * address and args[2] its length.  Returns 0 or a negated errno value.
 */
static int
lx_bind(ulong_t *args)
{
	int sockfd = (int)args[0];
	socklen_t inlen = (socklen_t)args[2];
	struct stat64 statbuf;
	struct sockaddr *name;
	socklen_t len;
	size_t bufsz;
	int r, err;

	if ((ssize_t)inlen < 0)
		return (-EINVAL);

	/*
	 * convert_sockaddr() expands an AF_INET6 address to the larger
	 * Solaris sockaddr_in6, so the buffer must have room for that no
	 * matter how short the Linux address is.  One extra byte, left
	 * zero, guarantees that an AF_UNIX path filling the whole address
	 * is still NUL-terminated when it is used as a string below.
	 */
	bufsz = inlen;
	if (bufsz < sizeof (struct sockaddr_in6))
		bufsz = sizeof (struct sockaddr_in6);
	bufsz++;

	if ((name = SAFE_ALLOCA(bufsz)) == NULL)
		return (-EINVAL);
	bzero(name, bufsz);

	if ((r = convert_sockaddr(name, &len, (struct sockaddr *)args[1],
	    inlen)) < 0)
		return (r);

	lx_debug("\tbind(%d, 0x%p, %d)", sockfd, name, len);

	if (name->sa_family == AF_UNIX)
		lx_debug("\t\tAF_UNIX, path = %s", name->sa_data);

	r = bind(sockfd, name, len);
	if (r >= 0)
		return (r);

	/* Remember why bind() failed; stat64() below may change errno. */
	err = errno;

	/*
	 * Linux returns EADDRINUSE for attempts to bind to UNIX domain
	 * sockets that aren't sockets.
	 */
	if ((err == EINVAL) && (name->sa_family == AF_UNIX) &&
	    (stat64(name->sa_data, &statbuf) == 0) &&
	    (!S_ISSOCK(statbuf.st_mode)))
		return (-EADDRINUSE);

	return (-err);
}

/*
 * connect(2) emulation: args[0] is the socket, args[1] the Linux socket
 * address and args[2] its length.  Returns 0 or a negated errno value.
 */
static int
lx_connect(ulong_t *args)
{
	int sockfd = (int)args[0];
	socklen_t inlen = (socklen_t)args[2];
	struct sockaddr *name;
	socklen_t len;
	size_t bufsz;
	int r, err;

	if ((ssize_t)inlen < 0)
		return (-EINVAL);

	/*
	 * convert_sockaddr() expands an AF_INET6 address to the larger
	 * Solaris sockaddr_in6, so the buffer must have room for that no
	 * matter how short the Linux address is.  One extra byte, left
	 * zero, keeps an AF_UNIX path NUL-terminated for the debug output
	 * below.
	 */
	bufsz = inlen;
	if (bufsz < sizeof (struct sockaddr_in6))
		bufsz = sizeof (struct sockaddr_in6);
	bufsz++;

	if ((name = SAFE_ALLOCA(bufsz)) == NULL)
		return (-EINVAL);
	bzero(name, bufsz);

	if ((r = convert_sockaddr(name, &len, (struct sockaddr *)args[1],
	    inlen)) < 0)
		return (r);

	lx_debug("\tconnect(%d, 0x%p, %d)", sockfd, name, len);

	if (name->sa_family == AF_UNIX)
		lx_debug("\t\tAF_UNIX, path = %s", name->sa_data);

	r = connect(sockfd, name, len);
	err = errno;

	return ((r < 0) ? -err : r);
}

/*
 * listen(2) emulation: args[0] is the socket and args[1] the backlog.
 * Returns the result of listen() or a negated errno value.
 */
static int
lx_listen(ulong_t *args)
{
	int sockfd = (int)args[0];
	int backlog = (int)args[1];
	int rv;

	lx_debug("\tlisten(%d, %d)", sockfd, backlog);

	if ((rv = listen(sockfd, backlog)) < 0)
		return (-errno);

	return (rv);
}

/*
 * accept(2) emulation: args[0] is the listening socket, args[1] the
 * caller's address buffer (may be NULL) and args[2] a pointer to the
 * caller's length word.  Returns the new descriptor or a negated errno
 * value.  If the call fails after the connection has been accepted, the
 * new descriptor is closed rather than left open where the caller can
 * never learn of it.
 */
static int
lx_accept(ulong_t *args)
{
	int sockfd = (int)args[0];
	struct sockaddr *name = (struct sockaddr *)args[1];
	socklen_t namelen = 0;
	int r, err;

	lx_debug("\taccept(%d, 0x%p, 0x%p", sockfd, args[1], args[2]);

	/*
	 * The Linux man page says that -1 is returned and errno is set to
	 * EFAULT if the "name" address is bad, but it is silent on what to
	 * set errno to if the "namelen" address is bad.  Experimentation
	 * shows that Linux (at least the 2.4.21 kernel in CentOS) actually
	 * sets errno to EINVAL in both cases.
	 *
	 * Note that we must first check the name pointer, as the Linux
	 * docs state nothing is copied out if the "name" pointer is NULL.
	 * If it is NULL, we don't care about the namelen pointer's value
	 * or about dereferencing it.
	 *
	 * Happily, Solaris' accept(3SOCKET) treats NULL name pointers and
	 * zero namelens the same way.
	 */
	if ((name != NULL) &&
	    (uucopy((void *)args[2], &namelen, sizeof (socklen_t)) != 0))
		return ((errno == EFAULT) ? -EINVAL : -errno);

	lx_debug("\taccept namelen = %d", namelen);

	if ((r = accept(sockfd, name, &namelen)) < 0)
		return ((errno == EFAULT) ? -EINVAL : -errno);

	lx_debug("\taccept namelen returned %d bytes", namelen);

	/*
	 * In Linux, accept()ed sockets do not inherit anything set by
	 * fcntl(), so filter those out.
	 */
	if (fcntl(r, F_SETFL, 0) < 0) {
		err = errno;
		(void) close(r);
		return (-err);
	}

	/*
	 * Once again, a bad "namelen" address sets errno to EINVAL, not
	 * EFAULT.  If namelen was zero, there's no need to copy a zero back
	 * out.
	 *
	 * Logic might dictate that we should check if we can write to
	 * the namelen pointer earlier so we don't accept a pending connection
	 * only to fail the call because we can't write the namelen value back
	 * out. However, testing shows Linux does indeed fail the call after
	 * accepting the connection so we must behave in a compatible manner.
	 * Since the caller never sees the descriptor in that case, close it
	 * here instead of leaking it.
	 */
	if ((name != NULL) && (namelen != 0) &&
	    (uucopy(&namelen, (void *)args[2], sizeof (socklen_t)) != 0)) {
		err = (errno == EFAULT) ? EINVAL : errno;
		(void) close(r);
		return (-err);
	}

	return (r);
}

/*
 * getsockname(2) emulation: args[0] is the socket, args[1] the caller's
 * address buffer and args[2] a pointer to the caller's length word.
 * Returns 0 or a negated errno value.
 */
static int
lx_getsockname(ulong_t *args)
{
	int sockfd = (int)args[0];
	void *uaddr = (void *)args[1];
	void *ulenp = (void *)args[2];
	struct sockaddr *buf = NULL;
	socklen_t len, ulen;

	if (uucopy(ulenp, &len, sizeof (socklen_t)) != 0)
		return (-errno);

	/* Remember how much room the caller said it has. */
	ulen = len;

	lx_debug("\tgetsockname(%d, 0x%p, 0x%p (=%d))",
	    sockfd, args[1], args[2], len);

	/* With a zero length there is nothing to allocate; pass NULL. */
	if (len > 0) {
		buf = SAFE_ALLOCA(len);
		if (buf == NULL)
			return (-EINVAL);
		bzero(buf, len);
	}

	if (getsockname(sockfd, buf, &len) < 0)
		return (-errno);

	/*
	 * If the name that getsockname() wants to return is larger than
	 * the length we passed in, it copies out as much as fits and then
	 * updates the length to the size of everything it wanted to
	 * return.  Our buffer only holds the original length, so that is
	 * how much goes back to the caller.
	 */
	if (uucopy(buf, uaddr, ulen) != 0)
		return (-errno);

	if (uucopy(&len, ulenp, sizeof (socklen_t)) != 0)
		return (-errno);

	return (0);
}

/*
 * getpeername(2) emulation: args[0] is the socket, args[1] the caller's
 * address buffer and args[2] a pointer to the caller's length word.
 * Returns 0 or a negated errno value.
 */
static int
lx_getpeername(ulong_t *args)
{
	int sockfd = (int)args[0];
	struct sockaddr *name;
	socklen_t namelen, buflen, copylen;

	if (uucopy((void *)args[2], &namelen, sizeof (socklen_t)) != 0)
		return (-errno);

	lx_debug("\tgetpeername(%d, 0x%p, 0x%p (=%d))",
	    sockfd, args[1], args[2], namelen);

	/*
	 * Linux returns EFAULT in this case, even if the namelen parameter
	 * is 0.  This check will not catch other illegal addresses, but
	 * the benefit catching a non-null illegal address here is not
	 * worth the cost of another system call.
	 */
	if ((void *)args[1] == NULL)
		return (-EFAULT);

	/* Size of the buffer we are about to allocate. */
	buflen = namelen;

	if ((name = SAFE_ALLOCA(buflen)) == NULL)
		return (-EINVAL);
	if ((getpeername(sockfd, name, &namelen)) < 0)
		return (-errno);

	/*
	 * getpeername() may update namelen to the full size of the peer's
	 * address even when that is more than we gave it room for.  Never
	 * copy out more than the buffer actually holds.
	 */
	copylen = (namelen < buflen) ? namelen : buflen;

	if (uucopy(name, (void *)args[1], copylen) != 0)
		return (-errno);

	if (uucopy(&namelen, (void *)args[2], sizeof (socklen_t)) != 0)
		return (-errno);

	return (0);
}

/*
 * socketpair(2) emulation: args[0] is the Linux domain, args[1] the Linux
 * socket type, args[2] the protocol and args[3] the caller's two-element
 * descriptor array.  Returns 0 or a negated errno value.
 */
static int
lx_socketpair(ulong_t *args)
{
	int protocol = (int)args[2];
	int *sv = (int *)args[3];
	int sol_domain, sol_type;
	int pair[2];
	int rv;

	rv = convert_sock_args((int)args[0], (int)args[1], protocol,
	    &sol_domain, &sol_type);
	if (rv != 0)
		return (rv);

	lx_debug("\tsocketpair(%d, %d, %d, 0x%p)", sol_domain, sol_type,
	    protocol, sv);

	rv = socketpair(sol_domain, sol_type, protocol, pair);
	if (rv != 0) {
		/* An unsupported protocol is reported as a bad socket type. */
		if (errno == EPROTONOSUPPORT)
			return (-ESOCKTNOSUPPORT);
		return (-errno);
	}

	if (uucopy(pair, sv, sizeof (pair)) == 0)
		return (0);

	/*
	 * The descriptors could not be handed to the caller, so don't
	 * leave them open.  Save errno first as close() may change it.
	 */
	rv = errno;
	(void) close(pair[0]);
	(void) close(pair[1]);

	return (-rv);
}

/*
 * send(2) emulation: args[0] is the socket, args[1] the buffer, args[2]
 * the buffer length and args[3] the Linux MSG_* flags.  Returns the value
 * reported by send() or a negated errno value.
 *
 * The return type is int so that the definition agrees with the file's
 * prototype and with sockfn_t, through which this function is called.
 */
static int
lx_send(ulong_t *args)
{
	int sockfd = (int)args[0];
	void *buf = (void *)args[1];
	size_t len = (size_t)args[2];
	int flags = (int)args[3];
	ssize_t r;
	int err;

	int nosigpipe = flags & LX_MSG_NOSIGNAL;
	struct sigaction newact, oact;

	lx_debug("\tsend(%d, 0x%p, 0x%d, 0x%x)", sockfd, buf, len, flags);

	flags = convert_sockflags(flags);

	/*
	 * If nosigpipe is set, we want to emulate the Linux action of
	 * not sending a SIGPIPE to the caller if the remote socket has
	 * already been closed.
	 *
	 * As SIGPIPE is a directed signal sent only to the thread that
	 * performed the action, we can emulate this behavior by momentarily
	 * resetting the action for SIGPIPE to SIG_IGN, performing the socket
	 * call, and resetting the action back to its previous value.
	 */
	if (nosigpipe) {
		newact.sa_handler = SIG_IGN;
		newact.sa_flags = 0;
		(void) sigemptyset(&newact.sa_mask);

		if (sigaction(SIGPIPE, &newact, &oact) < 0)
			lx_err_fatal(gettext(
			    "%s: could not ignore SIGPIPE to emulate "
			    "LX_MSG_NOSIGNAL"), "send()");
	}

	r = send(sockfd, buf, len, flags);

	/* Capture errno before sigaction() gets a chance to disturb it. */
	err = errno;

	if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
		lx_err_fatal(
		    gettext("%s: could not reset SIGPIPE handler to "
		    "emulate LX_MSG_NOSIGNAL"), "send()");

	return ((r < 0) ? -err : (int)r);
}

/*
 * recv(2) emulation: args[0] is the socket, args[1] the buffer, args[2]
 * the buffer length and args[3] the Linux MSG_* flags.  Returns the value
 * reported by recv() or a negated errno value.
 *
 * The return type is int so that the definition agrees with the file's
 * prototype and with sockfn_t, through which this function is called.
 */
static int
lx_recv(ulong_t *args)
{
	int sockfd = (int)args[0];
	void *buf = (void *)args[1];
	size_t len = (size_t)args[2];
	int flags = (int)args[3];
	ssize_t r;
	int err;

	int nosigpipe = flags & LX_MSG_NOSIGNAL;
	struct sigaction newact, oact;

	lx_debug("\trecv(%d, 0x%p, 0x%d, 0x%x)", sockfd, buf, len, flags);

	flags = convert_sockflags(flags);

	/*
	 * If nosigpipe is set, we want to emulate the Linux action of
	 * not sending a SIGPIPE to the caller if the remote socket has
	 * already been closed.
	 *
	 * As SIGPIPE is a directed signal sent only to the thread that
	 * performed the action, we can emulate this behavior by momentarily
	 * resetting the action for SIGPIPE to SIG_IGN, performing the socket
	 * call, and resetting the action back to its previous value.
	 */
	if (nosigpipe) {
		newact.sa_handler = SIG_IGN;
		newact.sa_flags = 0;
		(void) sigemptyset(&newact.sa_mask);

		if (sigaction(SIGPIPE, &newact, &oact) < 0)
			lx_err_fatal(gettext(
			    "%s: could not ignore SIGPIPE to emulate "
			    "LX_MSG_NOSIGNAL"), "recv()");
	}

	r = recv(sockfd, buf, len, flags);

	/* Capture errno before sigaction() gets a chance to disturb it. */
	err = errno;

	if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
		lx_err_fatal(
		    gettext("%s: could not reset SIGPIPE handler to "
		    "emulate LX_MSG_NOSIGNAL"), "recv()");

	return ((r < 0) ? -err : (int)r);
}

/*
 * sendto(2) emulation: args[0] is the socket, args[1] the buffer, args[2]
 * the buffer length, args[3] the Linux MSG_* flags, args[4] the Linux
 * destination address (may be NULL) and args[5] its length.  Returns the
 * value reported by sendto() or a negated errno value.
 *
 * The return type is int so that the definition agrees with the file's
 * prototype and with sockfn_t, through which this function is called.
 */
static int
lx_sendto(ulong_t *args)
{
	int sockfd = (int)args[0];
	void *buf = (void *)args[1];
	size_t len = (size_t)args[2];
	int flags = (int)args[3];
	socklen_t inlen = (socklen_t)args[5];
	struct sockaddr *to = NULL;
	socklen_t tolen = 0;
	size_t bufsz;
	ssize_t r;
	int err;

	int nosigpipe = flags & LX_MSG_NOSIGNAL;
	struct sigaction newact, oact;

	/*
	 * A NULL destination means "no address supplied" (e.g. sending on a
	 * connected socket); there is nothing to convert in that case and
	 * the length argument is ignored.
	 */
	if ((void *)args[4] != NULL) {
		if ((ssize_t)inlen < 0)
			return (-EINVAL);

		/*
		 * convert_sockaddr() expands an AF_INET6 address to the
		 * larger Solaris sockaddr_in6, so the buffer must have room
		 * for that no matter how short the Linux address is.
		 */
		bufsz = inlen;
		if (bufsz < sizeof (struct sockaddr_in6))
			bufsz = sizeof (struct sockaddr_in6);

		if ((to = SAFE_ALLOCA(bufsz)) == NULL)
			return (-EINVAL);
		bzero(to, bufsz);

		if ((err = convert_sockaddr(to, &tolen,
		    (struct sockaddr *)args[4], inlen)) < 0)
			return (err);
	}

	lx_debug("\tsendto(%d, 0x%p, 0x%d, 0x%x, 0x%x, %d)", sockfd, buf, len,
	    flags, to, tolen);

	flags = convert_sockflags(flags);

	/*
	 * If nosigpipe is set, we want to emulate the Linux action of
	 * not sending a SIGPIPE to the caller if the remote socket has
	 * already been closed.
	 *
	 * As SIGPIPE is a directed signal sent only to the thread that
	 * performed the action, we can emulate this behavior by momentarily
	 * resetting the action for SIGPIPE to SIG_IGN, performing the socket
	 * call, and resetting the action back to its previous value.
	 */
	if (nosigpipe) {
		newact.sa_handler = SIG_IGN;
		newact.sa_flags = 0;
		(void) sigemptyset(&newact.sa_mask);

		if (sigaction(SIGPIPE, &newact, &oact) < 0)
			lx_err_fatal(gettext(
			    "%s: could not ignore SIGPIPE to emulate "
			    "LX_MSG_NOSIGNAL"), "sendto()");
	}

	r = sendto(sockfd, buf, len, flags, to, tolen);

	/* Capture errno before sigaction() gets a chance to disturb it. */
	err = errno;

	if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
		lx_err_fatal(
		    gettext("%s: could not reset SIGPIPE handler to "
		    "emulate LX_MSG_NOSIGNAL"), "sendto()");

	if (r < 0) {
		/*
		 * according to the man page and LTP, the expected error in
		 * this case is EPIPE.
		 */
		if (err == ENOTCONN)
			return (-EPIPE);
		return (-err);
	}

	return ((int)r);
}

/*
 * recvfrom(2) emulation: args[0] is the socket, args[1] the buffer,
 * args[2] the buffer length, args[3] the Linux MSG_* flags, args[4] the
 * caller's source address buffer and args[5] a pointer to its length.
 * The address buffer and length pointer are passed to recvfrom() as given.
 * Returns the value reported by recvfrom() or a negated errno value.
 *
 * The return type is int so that the definition agrees with the file's
 * prototype and with sockfn_t, through which this function is called.
 */
static int
lx_recvfrom(ulong_t *args)
{
	int sockfd = (int)args[0];
	void *buf = (void *)args[1];
	size_t len = (size_t)args[2];
	int flags = (int)args[3];
	struct sockaddr *from = (struct sockaddr *)args[4];
	socklen_t *from_lenp = (socklen_t *)args[5];
	ssize_t r;
	int err;

	int nosigpipe = flags & LX_MSG_NOSIGNAL;
	struct sigaction newact, oact;

	lx_debug("\trecvfrom(%d, 0x%p, 0x%d, 0x%x, 0x%x, 0x%p)", sockfd, buf,
	    len, flags, from, from_lenp);

	flags = convert_sockflags(flags);

	/*
	 * If nosigpipe is set, we want to emulate the Linux action of
	 * not sending a SIGPIPE to the caller if the remote socket has
	 * already been closed.
	 *
	 * As SIGPIPE is a directed signal sent only to the thread that
	 * performed the action, we can emulate this behavior by momentarily
	 * resetting the action for SIGPIPE to SIG_IGN, performing the socket
	 * call, and resetting the action back to its previous value.
	 */
	if (nosigpipe) {
		newact.sa_handler = SIG_IGN;
		newact.sa_flags = 0;
		(void) sigemptyset(&newact.sa_mask);

		if (sigaction(SIGPIPE, &newact, &oact) < 0)
			lx_err_fatal(gettext(
			    "%s: could not ignore SIGPIPE to emulate "
			    "LX_MSG_NOSIGNAL"), "recvfrom()");
	}

	r = recvfrom(sockfd, buf, len, flags, from, from_lenp);

	/* Capture errno before sigaction() gets a chance to disturb it. */
	err = errno;

	if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
		lx_err_fatal(
		    gettext("%s: could not reset SIGPIPE handler to "
		    "emulate LX_MSG_NOSIGNAL"), "recvfrom()");

	return ((r < 0) ? -err : (int)r);
}

/*
 * shutdown(2) emulation: args[0] is the socket and args[1] the "how"
 * value, which is handed to shutdown() unchanged.  Returns the result of
 * shutdown() or a negated errno value.
 */
static int
lx_shutdown(ulong_t *args)
{
	int sockfd = (int)args[0];
	int how = (int)args[1];
	int rv;

	lx_debug("\tshutdown(%d, %d)", sockfd, how);

	if ((rv = shutdown(sockfd, how)) < 0)
		return (-errno);

	return (rv);
}

/*
 * setsockopt(2) emulation: args[0] is the socket, args[1] the Linux
 * protocol level, args[2] the Linux option name, args[3] the option value
 * and args[4] its length.  The option name is translated through
 * ltos_proto_opts before the call is made.  Returns the result of
 * setsockopt() or a negated errno value.
 */
static int
lx_setsockopt(ulong_t *args)
{
	int sockfd = (int)args[0];
	int level = (int)args[1];
	int optname = (int)args[2];
	void *optval = (void *)args[3];
	int optlen = (int)args[4];
	int internal_opt;
	int r;

	lx_debug("\tsetsockopt(%d, %d, %d, 0x%p, %d)", sockfd, level, optname,
	    optval, optlen);

	/*
	 * The kernel returns EFAULT for all invalid addresses except NULL,
	 * for which it returns EINVAL.  Linux wants EFAULT for NULL too.
	 */
	if (optval == NULL)
		return (-EFAULT);

	/*
	 * Do a table lookup of the Solaris equivalent of the given option
	 */
	if (level < IPPROTO_IP || level >= IPPROTO_TAB_SIZE)
		return (-ENOPROTOOPT);

	if (ltos_proto_opts[level].maxentries == 0 ||
	    optname <= 0 || optname >= (ltos_proto_opts[level].maxentries))
		return (-ENOPROTOOPT);

	/*
	 * Option numbers are only meaningful within their level, so the
	 * TCP_CORK special case must be limited to IPPROTO_TCP.  The same
	 * number at another level is a different option and has to go
	 * through the table like any other.
	 */
	if ((level == IPPROTO_TCP) && (optname == LX_TCP_CORK)) {
		/*
		 * TCP_CORK is a Linux-only option that instructs the TCP
		 * stack not to send out partial frames.  Solaris doesn't
		 * include this option but some apps require it.  So, we do
		 * our best to emulate the option by disabling TCP_NODELAY.
		 * If the app requests that we disable TCP_CORK, we just
		 * ignore it since enabling TCP_NODELAY may be
		 * overcompensating.
		 */
		optname = TCP_NODELAY;
		if (optlen != sizeof (int))
			return (-EINVAL);
		if (uucopy(optval, &internal_opt, sizeof (int)) != 0)
			return (-errno);
		if (internal_opt == 0)
			return (0);
		internal_opt = 1;
		optval = &internal_opt;
	} else {
		optname = ltos_proto_opts[level].proto[optname];

		if (optname == OPTNOTSUP)
			return (-ENOPROTOOPT);
	}

	if (level == LX_SOL_SOCKET)
		level = SOL_SOCKET;

	r = setsockopt(sockfd, level, optname, optval, optlen);

	return ((r < 0) ? -errno : r);
}

/*
 * getsockopt(2) emulation: args[0] is the socket, args[1] the Linux
 * protocol level, args[2] the Linux option name, args[3] the caller's
 * value buffer and args[4] a pointer to the caller's length word.  The
 * option name is translated through ltos_proto_opts before the call is
 * made.  Returns the result of getsockopt() or a negated errno value.
 */
static int
lx_getsockopt(ulong_t *args)
{
	int sockfd = (int)args[0];
	int level = (int)args[1];
	int optname = (int)args[2];
	void *optval = (void *)args[3];
	int *optlen = (int *)args[4];
	int r;

	lx_debug("\tgetsockopt(%d, %d, %d, 0x%p, 0x%p)", sockfd, level, optname,
	    optval, optlen);

	/*
	 * According to the Linux man page, a NULL optval should indicate
	 * (as in Solaris) that no return value is expected.  Instead, it
	 * actually triggers an EFAULT error.
	 */
	if (optval == NULL)
		return (-EFAULT);

	/*
	 * Do a table lookup of the Solaris equivalent of the given option
	 */
	if (level < IPPROTO_IP || level >= IPPROTO_TAB_SIZE)
		return (-EOPNOTSUPP);

	if (ltos_proto_opts[level].maxentries == 0 ||
	    optname <= 0 || optname >= (ltos_proto_opts[level].maxentries))
		return (-ENOPROTOOPT);

	/*
	 * Option numbers are only meaningful within their level, so the
	 * TCP_CORK special case must be limited to IPPROTO_TCP.  The same
	 * number at another level is a different option and has to go
	 * through the table like any other.
	 */
	if ((level == IPPROTO_TCP) && (optname == LX_TCP_CORK)) {
		/*
		 * We don't support TCP_CORK but some apps rely on it.  So,
		 * rather than return an error we just return 0.  This
		 * isn't exactly a lie, since this option really isn't set,
		 * but it's not the whole truth either.  Fortunately, we
		 * aren't under oath.
		 */
		r = 0;
		if (uucopy(&r, optval, sizeof (int)) != 0)
			return (-errno);
		r = sizeof (int);
		if (uucopy(&r, optlen, sizeof (int)) != 0)
			return (-errno);
		return (0);
	}

	optname = ltos_proto_opts[level].proto[optname];

	if (optname == OPTNOTSUP)
		return (-ENOPROTOOPT);

	if (level == LX_SOL_SOCKET)
		level = SOL_SOCKET;

	r = getsockopt(sockfd, level, optname, optval, optlen);

	return ((r < 0) ? -errno : r);
}

/*
 * libc routines that issue these system calls.  We bypass the libsocket
 * wrappers since they explicitly turn off the MSG_XPG4_2 flag we need for
 * Linux compatibility.
 */
extern int _so_sendmsg();
extern int _so_recvmsg();

/*
 * sendmsg(2) emulation: args[0] is the socket, args[1] the caller's Linux
 * msghdr and args[2] the Linux MSG_* flags.  Any control messages are
 * copied into our address space and converted to their Solaris form before
 * the message is sent.  Returns the value reported by the underlying call
 * or a negated errno value.
 */
static int
lx_sendmsg(ulong_t *args)
{
	int sockfd = (int)args[0];
	struct lx_msghdr msg;
	struct cmsghdr *cmsg;
	int flags = (int)args[2];
	int r, err;

	int nosigpipe = flags & LX_MSG_NOSIGNAL;
	struct sigaction newact, oact;

	lx_debug("\tsendmsg(%d, 0x%p, 0x%x)", sockfd, (void *)args[1], flags);

	flags = convert_sockflags(flags);

	if ((uucopy((void *)args[1], &msg, sizeof (msg))) != 0)
		return (-errno);

	/*
	 * If there are control messages bundled in this message, we need
	 * to convert them from Linux to Solaris.  The conversion is done on
	 * a private copy so the caller's buffer is left untouched.
	 */
	if (msg.msg_control != NULL) {
		if (msg.msg_controllen == 0) {
			cmsg = NULL;
		} else {
			cmsg = SAFE_ALLOCA(msg.msg_controllen);
			if (cmsg == NULL)
				return (-EINVAL);
		}
		if ((uucopy(msg.msg_control, cmsg, msg.msg_controllen)) != 0)
			return (-errno);
		msg.msg_control = cmsg;
		if ((r = convert_cmsgs(LX_TO_SOL, &msg, "sendmsg()")) != 0)
			return (-r);
	}

	/*
	 * If nosigpipe is set, we want to emulate the Linux action of
	 * not sending a SIGPIPE to the caller if the remote socket has
	 * already been closed.
	 *
	 * As SIGPIPE is a directed signal sent only to the thread that
	 * performed the action, we can emulate this behavior by momentarily
	 * resetting the action for SIGPIPE to SIG_IGN, performing the socket
	 * call, and resetting the action back to its previous value.
	 */
	if (nosigpipe) {
		newact.sa_handler = SIG_IGN;
		newact.sa_flags = 0;
		(void) sigemptyset(&newact.sa_mask);

		if (sigaction(SIGPIPE, &newact, &oact) < 0)
			lx_err_fatal(gettext(
			    "%s: could not ignore SIGPIPE to emulate "
			    "LX_MSG_NOSIGNAL"), "sendmsg()");
	}

	r = _so_sendmsg(sockfd, (struct msghdr *)&msg, flags | MSG_XPG4_2);

	/* Capture errno before sigaction() gets a chance to disturb it. */
	err = errno;

	if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
		lx_err_fatal(
		    gettext("%s: could not reset SIGPIPE handler to "
		    "emulate LX_MSG_NOSIGNAL"), "sendmsg()");

	if (r < 0) {
		/*
		 * according to the man page and LTP, the expected error in
		 * this case is EPIPE.
		 */
		if (err == ENOTCONN)
			return (-EPIPE);
		return (-err);
	}

	return (r);
}

/*
 * recvmsg(2) emulation: args[0] is the socket, args[1] the caller's Linux
 * msghdr and args[2] the Linux MSG_* flags.  Control messages are received
 * into a private buffer, converted to their Linux form and then copied to
 * the caller's control buffer.  The msghdr is copied back to the caller
 * with its msg_control field pointing at the caller's own buffer again.
 * Returns the value reported by the underlying call or a negated errno
 * value.
 */
static int
lx_recvmsg(ulong_t *args)
{
	int sockfd = (int)args[0];
	struct lx_msghdr msg;
	struct lx_msghdr *msgp = (struct lx_msghdr *)args[1];
	struct cmsghdr *cmsg;
	int flags = (int)args[2];
	int r, err, recverr;

	int nosigpipe = flags & LX_MSG_NOSIGNAL;
	struct sigaction newact, oact;

	lx_debug("\trecvmsg(%d, 0x%p, 0x%x)", sockfd, (void *)args[1], flags);

	flags = convert_sockflags(flags);

	if ((uucopy(msgp, &msg, sizeof (msg))) != 0)
		return (-errno);

	/*
	 * Remember where the caller's control buffer lives.  The pointer in
	 * our copy of the msghdr is about to be replaced, and the original
	 * is needed both to deliver the control data and to put the
	 * caller's msghdr back the way it was.
	 */
	cmsg = msg.msg_control;

	/*
	 * If we are expecting to have to convert any control messages,
	 * then we should receive them into our address space instead of
	 * the app's.
	 */
	if (msg.msg_control != NULL) {
		if (msg.msg_controllen == 0) {
			msg.msg_control = NULL;
		} else {
			msg.msg_control = SAFE_ALLOCA(msg.msg_controllen);
			if (msg.msg_control == NULL)
				return (-EINVAL);
		}
	}

	/*
	 * If nosigpipe is set, we want to emulate the Linux action of
	 * not sending a SIGPIPE to the caller if the remote socket has
	 * already been closed.
	 *
	 * As SIGPIPE is a directed signal sent only to the thread that
	 * performed the action, we can emulate this behavior by momentarily
	 * resetting the action for SIGPIPE to SIG_IGN, performing the socket
	 * call, and resetting the action back to its previous value.
	 */
	if (nosigpipe) {
		newact.sa_handler = SIG_IGN;
		newact.sa_flags = 0;
		(void) sigemptyset(&newact.sa_mask);

		if (sigaction(SIGPIPE, &newact, &oact) < 0)
			lx_err_fatal(gettext(
			    "%s: could not ignore SIGPIPE to emulate "
			    "LX_MSG_NOSIGNAL"), "recvmsg()");
	}

	r = _so_recvmsg(sockfd, (struct msghdr *)&msg, flags | MSG_XPG4_2);

	/* Capture errno before any later call gets a chance to disturb it. */
	recverr = errno;

	if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
		lx_err_fatal(
		    gettext("%s: could not reset SIGPIPE handler to "
		    "emulate LX_MSG_NOSIGNAL"), "recvmsg()");

	if (r >= 0 && msg.msg_control != NULL) {
		/*
		 * If there are control messages bundled in this message,
		 * we need to convert them from Solaris to Linux.
		 */
		if ((err = convert_cmsgs(SOL_TO_LX, &msg, "recvmsg()")) != 0)
			return (-err);

		if ((uucopy(msg.msg_control, cmsg, msg.msg_controllen)) != 0)
			return (-errno);
	}

	/*
	 * The caller must get back its own control buffer address, not the
	 * address of our private buffer (or the NULL substituted for a
	 * zero-length buffer).
	 */
	msg.msg_control = cmsg;

	/*
	 * A handful of the values in the msghdr are set by the recvmsg()
	 * call, so copy their values back to the caller.  Rather than iterate,
	 * just copy the whole structure back.
	 */
	if (uucopy(&msg, msgp, sizeof (msg)) != 0)
		return (-errno);

	return ((r < 0) ? -recverr : r);
}

/*
 * Entry point for the Linux socketcall multiplexer.  p1 is the subcommand
 * number and p2 the address, in the application, of that subcommand's
 * argument array.  Returns whatever the subcommand handler returns, or a
 * negated errno value if the subcommand is out of range or the arguments
 * cannot be copied in.
 */
int
lx_socketcall(uintptr_t p1, uintptr_t p2)
{
	ulong_t sockargs[6];
	size_t nbytes;
	int idx;

	/* Subcommands are numbered from 1; the table is indexed from 0. */
	idx = (int)p1 - 1;
	if (idx < 0 || idx >= LX_RECVMSG)
		return (-EINVAL);

	/*
	 * Bring in exactly as many arguments as this subcommand takes.  A
	 * bogus pointer from the app shows up here as a uucopy() failure.
	 */
	nbytes = sockfns[idx].s_nargs * sizeof (ulong_t);
	if (uucopy((void *)p2, sockargs, nbytes) != 0)
		return (-errno);

	return ((sockfns[idx].s_fn)(sockargs));
}