socket.c revision ec5347e2c775f027573ce5648b910361aa926c01
a134177ed9f82189504191d90f3ed9e97c2b47cbTinderbox User * Copyright (C) 2004-2007 Internet Systems Consortium, Inc. ("ISC")
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * Copyright (C) 2000-2003 Internet Software Consortium.
0c27b3fe77ac1d5094ba3521e8142d9e7973133fMark Andrews * Permission to use, copy, modify, and/or distribute this software for any
0c27b3fe77ac1d5094ba3521e8142d9e7973133fMark Andrews * purpose with or without fee is hereby granted, provided that the above
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * copyright notice and this permission notice appear in all copies.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * PERFORMANCE OF THIS SOFTWARE.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews/* $Id: socket.c,v 1.51 2007/06/18 23:47:49 tbox Exp $ */
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews/* This code has been rewritten to take advantage of Windows Sockets
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * I/O Completion Ports and Events. I/O Completion Ports is ONLY
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * available on Windows NT, Windows 2000 and Windows XP series of
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * the Windows Operating Systems. It CANNOT run on Windows 95, Windows 98
b0ba1a6059b6d6c4b3aa77d8bc84cc443b981e01Mukund Sivaraman * or the follow-ons to those Systems.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * This code is by nature multithreaded and takes advantage of various
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * features to pass on information through the completion port for
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * when I/O is completed. All sends and receives are completed through
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * the completion port. Due to an implementation bug in Windows 2000,
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * Service Pack 2 must be installed on the system for this code to run correctly.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * For details on this problem see Knowledge base article Q263823.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * The code checks for this. The number of Completion Port Worker threads
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * used is the total number of CPUs + 1. This increases the likelihood that
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * a Worker Thread is available for processing a completed request.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * All accepts and connects are accomplished through the WSAEventSelect()
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * function and the event_wait loop. Events are added to and deleted from
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * each event_wait thread via a common event_update stack owned by the socket
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * manager. If the event_wait thread runs out of array space in the events
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * array it will look for another event_wait thread to add the event. If it
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * fails to find another one it will create a new thread to handle the
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * outstanding event.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * A future enhancement is to use AcceptEx to take advantage of Overlapped
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * I/O which allows for enhanced performance of TCP connections.
e851ea826066ac5a5b01c2c23218faa0273a12e8Evan Hunt * This will also reduce the number of events that are waited on by the
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * event_wait threads to just the connect sockets and reduce the number
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * of additional threads required.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * XXXPDM 5 August, 2002
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews#define _WINSOCKAPI_ /* Prevent inclusion of winsock.h in windows.h */
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * Define this macro to control the behavior of connection
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * resets on UDP sockets. See Microsoft KnowledgeBase Article Q263823
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * for details.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * NOTE: This requires that Windows 2000 systems install Service Pack 2
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
b0ba1a6059b6d6c4b3aa77d8bc84cc443b981e01Mukund Sivaraman * Some systems define the socket length argument as an int, some as size_t,
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * some as socklen_t. This is here so it can be easily changed if needed.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews#define ISC_SOCKADDR_LEN_T unsigned int
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * Define what the possible "soft" errors can be. These are non-fatal returns
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * of various network related functions, like recv() and so on.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * Pending errors are not really errors and should be
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * kept separate
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews#define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews#define DOIO_SOFT 1 /* i/o ok, soft error, no event sent */
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews#define DOIO_PENDING 4 /* status when i/o is in process */
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
b0ba1a6059b6d6c4b3aa77d8bc84cc443b981e01Mukund Sivaraman * DLVL(90) -- Function entry/exit and other tracing.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * DLVL(70) -- Socket "correctness" -- including returning of events, etc.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * DLVL(60) -- Socket data send/receive
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * DLVL(50) -- Event tracing, including receiving/sending completion events.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * DLVL(20) -- Socket creation/destruction.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews#define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o')
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews#define VALID_SOCKET(t) ISC_MAGIC_VALID(t, SOCKET_MAGIC)
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * IPv6 control information. If the socket is an IPv6 socket we want
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * to collect the destination address and interface so the client can
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * set them on outgoing packets.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * We really don't want to try and use these control messages. Win32
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * doesn't have this mechanism before XP.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * Message header for recvmsg and sendmsg calls.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * Used value-result for recvmsg, value only for sendmsg.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews void *msg_control; /* ancillary data, see below */
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews u_int msg_controllen; /* ancillary data buffer len */
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews int msg_totallen; /* total length of this message */
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * The size to raise the receive buffer to.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * The number of times a send operation is repeated if the result
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews * is WSAEINTR.
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews /* Not locked. */
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews unsigned int magic;
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews /* Pointers to scatter/gather buffers */
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews DWORD evthread_id; /* Event Thread Id for socket */
f1c1aab2c9e720399d66d8db5f40515d47c45ecfMark Andrews /* Locked by socket lock. */
unsigned int references;
int pf;
#ifdef ISC_SOCKET_NAMES
void * tag;
unsigned int pending_recv;
unsigned int pending_send;
static int iocp_total = 0;
typedef struct IoCompletionInfo {
int request_type;
struct event_change {
unsigned int action;
typedef struct sock_event_list {
int max_event;
int total_events;
struct events_thread {
struct isc_socketmgr {
unsigned int magic;
int event_written;
int maxIOCPThreads;
#if defined(ISC_SOCKET_DEBUG)
int errval;
strbuf);
int errval;
for (i = 0; i < total_threads; i++) {
manager, 0,
strbuf);
* Create/initialise the I/O completion port
int errval;
strbuf);
for (i = 0; i < MAX_EVENTS; i++) {
return (ISC_R_UNEXPECTED);
return (ISC_R_SUCCESS);
int max_event;
return (ISC_FALSE);
return (ISC_TRUE);
return (ISC_FALSE);
iEvent = i;
return (ISC_FALSE);
if (dofree)
return (ISC_TRUE);
if (del) {
if (del) {
return (ISC_R_SUCCESS);
unsigned int action)
sizeof(event_change_t));
int stat;
const char *msg;
return (ISC_R_UNEXPECTED);
return (ISC_R_UNEXPECTED);
return (ISC_R_SUCCESS);
initialise(void) {
int err;
if (err != 0) {
strbuf);
InitSockets(void) {
if (!initialised)
int Result;
int total_sent;
*Error = 0;
NULL);
switch (*Error) {
case WSA_IO_INCOMPLETE :
case WSA_WAIT_IO_COMPLETION :
case WSA_IO_PENDING :
case NO_ERROR :
return (total_sent);
int total_bytes = 0;
int Result;
*Error = 0;
&NumBytes,
&Flags,
NULL);
switch (*Error) {
case WSA_IO_INCOMPLETE:
case WSA_WAIT_IO_COMPLETION:
case WSA_IO_PENDING:
case NO_ERROR:
return (total_bytes);
const char *fmt, ...)
static isc_result_t
int ret;
return (ISC_R_UNEXPECTED);
return (ISC_R_SUCCESS);
return (ISC_R_SUCCESS);
return (ISC_R_UNEXPECTED);
unsigned int iovcount;
write_count = 0;
iovcount = 0;
goto config;
+ skip_count);
skip_count = 0;
iovcount++;
unsigned int iovcount;
read_count = 0;
iovcount = 0;
iovcount++;
static isc_socketevent_t *
sizeof(*ev));
return (NULL);
ev->n = 0;
return (ev);
#if defined(ISC_SOCKET_DEBUG)
return (DOIO_HARD); \
return (DOIO_SOFT); \
return (DOIO_HARD); \
if (recv_errno != 0) {
return (DOIO_SOFT);
return (DOIO_HARD);
return (DOIO_EOF);
return (DOIO_SOFT);
#ifdef ISC_NET_RECVOVERFLOW
cc--;
actual_count = 0;
return (DOIO_SOFT);
return (DOIO_SUCCESS);
int *recv_errno)
int status;
sizeof(IoCompletionInfo));
#if defined(ISC_SOCKET_DEBUG)
if (*nbytes < 0) {
goto done;
goto done;
goto done;
done:
return (status);
if(send_errno != 0) {
return (DOIO_SOFT);
return (DOIO_HARD); \
return (DOIO_SOFT); \
return (DOIO_HARD); \
return (DOIO_HARD);
return (DOIO_SOFT);
return (DOIO_SUCCESS);
int *send_errno)
int status;
sizeof(IoCompletionInfo));
if (*nbytes < 0) {
goto done;
goto done;
goto done;
done:
return (status);
if (dofree)
static isc_result_t
return (ISC_R_NOMEMORY);
goto error;
return (ISC_R_SUCCESS);
return (result);
#if defined(USE_CMSG)
#if defined(SO_RCVBUF)
int size;
int socket_errno;
return (result);
switch (type) {
case isc_sockettype_udp:
return (result);
case isc_sockettype_tcp:
switch (socket_errno) {
case WSAEMFILE:
case WSAENOBUFS:
return (ISC_R_NORESOURCES);
case WSAEPROTONOSUPPORT:
case WSAEPFNOSUPPORT:
case WSAEAFNOSUPPORT:
return (ISC_R_FAMILYNOSUPPORT);
strbuf);
return (ISC_R_UNEXPECTED);
return (result);
#if defined(USE_CMSG)
#if defined(ISC_PLATFORM_HAVEIPV6)
#ifdef IPV6_RECVPKTINFO
strbuf);
strbuf);
#if defined(SO_RCVBUF)
return (ISC_R_SUCCESS);
if (kill_socket)
if (accept_errno != 0) {
switch (accept_errno) {
case WSAENETDOWN:
case WSAECONNRESET:
case WSAECONNABORTED:
strbuf);
addrbuf);
(void *)&addrlen);
goto soft_error;
strbuf);
if (addrlen == 0) {
goto soft_error;
goto soft_error;
int stat;
const char *msg;
* as it is handled by IOCP. (Joe Quanaim, lucent.com)
int stat;
const char *msg;
if (connect_errno != 0) {
switch (connect_errno) {
strbuf);
int io_state;
int cc;
goto done;
case DOIO_SOFT:
cc = 0;
recv_errno = 0;
goto done;
case DOIO_EOF:
goto done;
case DOIO_SUCCESS:
case DOIO_HARD:
done:
goto done;
case DOIO_SOFT:
case DOIO_HARD:
case DOIO_SUCCESS:
done:
int request;
int errval;
int errstatus;
strbuf);
while (TRUE) {
INFINITE);
errstatus = 0;
if (!bSuccess) {
case SOCKET_RECV:
case SOCKET_SEND:
if (dofree)
switch (request) {
case SOCKET_RECV:
case SOCKET_SEND:
return ((isc_threadresult_t)0);
int cc;
int event_errno;
int iEvent;
int max_event;
int err;
event_errno = 0;
FALSE);
sizeof(strbuf));
strbuf);
if (iEvent == 0)
strbuf);
return ((isc_threadresult_t)0);
return (ISC_R_NOMEMORY);
InitSockets();
return (result);
return (ISC_R_UNEXPECTED);
return (result);
return (ISC_R_SUCCESS);
!= ISC_R_SUCCESS)
static isc_result_t
unsigned int flags)
int io_state;
int cc = 0;
int recv_errno = 0;
switch (io_state) {
case DOIO_SOFT:
case DOIO_EOF:
case DOIO_HARD:
case DOIO_SUCCESS:
return (result);
unsigned int iocount;
return (ISC_R_NOMEMORY);
if (minimum == 0)
return (ISC_R_NOMEMORY);
event->n = 0;
if (minimum == 0)
static isc_result_t
unsigned int flags)
int io_state;
int send_errno = 0;
int cc = 0;
switch (io_state) {
case DOIO_SOFT:
if (!have_lock) {
case DOIO_SUCCESS:
if (have_lock)
return (result);
NULL));
return (ISC_R_NOMEMORY);
NULL));
unsigned int iocount;
return (ISC_R_NOMEMORY);
event->n = 0;
int bind_errno;
return (ISC_R_FAMILYMISMATCH);
sizeof(on)) < 0) {
switch (bind_errno) {
case WSAEACCES:
return (ISC_R_NOPERM);
case WSAEADDRNOTAVAIL:
return (ISC_R_ADDRNOTAVAIL);
case WSAEADDRINUSE:
return (ISC_R_ADDRINUSE);
case WSAEINVAL:
return (ISC_R_BOUND);
strbuf);
return (ISC_R_UNEXPECTED);
return (ISC_R_SUCCESS);
return (ISC_R_NOTIMPLEMENTED);
if (backlog == 0)
return (ISC_R_UNEXPECTED);
sizeof(strbuf));
return (retstat);
return (ISC_R_SUCCESS);
return (ISC_R_NOMEMORY);
return (result);
int stat;
const char *msg;
return (ISC_R_UNEXPECTED);
return (ISC_R_SUCCESS);
int cc;
int retstat;
int errval;
return (ISC_R_MULTICAST);
sizeof(*dev));
return (ISC_R_NOMEMORY);
if (cc < 0) {
goto queue;
switch (errval) {
return (ISC_R_UNEXPECTED);
return (ISC_R_SUCCESS);
if (cc == 0) {
return (ISC_R_SUCCESS);
sizeof(strbuf));
return (retstat);
return (ISC_R_SUCCESS);
return (result);
goto out;
strbuf);
goto out;
out:
return (result);
if (how == 0)
ev_link);
int stat;
const char *msg;
return (val);
#if defined(IPV6_V6ONLY)
#ifdef IPV6_V6ONLY
return (ISC_R_NOTIMPLEMENTED);
#ifdef ISC_SOCKET_NAMES