socket.c revision dfd9b3fa62216d2da842db63638325f6c6036835
/*
* Copyright (C) 1998 Internet Software Consortium.
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
* ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
* CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
* DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
* PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*/
#include <config.h>
#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <isc/assertions.h>
#include <isc/condition.h>
#include "util.h"
#ifndef _WIN32
#define WINAPI /* we're not windows */
#endif
/*
* Statement-like macro. The do { } while (0) wrapper makes the expansion
* behave as exactly one statement. The expansion carries no trailing
* semicolon -- the caller supplies it -- so that
*
*	if (x) ISC_TASK_SEND(a, b); else ...
*
* parses as intended instead of leaving a stray empty statement between
* the "if" body and the "else".
*/
#define ISC_TASK_SEND(a, b) do { \
} while (0)
/*
* Debug tracing configuration.
*
* The "#if 1" below defines ISC_SOCKET_DEBUG unconditionally; change it to
* "#if 0" to build without socket tracing.
*/
#if 1
#define ISC_SOCKET_DEBUG
#endif
#if defined(ISC_SOCKET_DEBUG)
/*
* Trace categories. Each value is a distinct single bit so that categories
* can be combined in a mask.
*/
#define TRACE_WATCHER 0x0001
#define TRACE_LISTEN 0x0002
#define TRACE_CONNECT 0x0004
#define TRACE_RECV 0x0008
#define TRACE_SEND 0x0010
#define TRACE_MANAGER 0x0020
/*
* Initialized with every bit set.
* NOTE(review): 0xffffffff does not fit in a 32-bit int, so the stored value
* depends on the implementation-defined unsigned-to-signed conversion;
* presumably this is meant as "all categories enabled" -- confirm.
*/
int trace_level = 0xffffffff;
#else
/* With debugging disabled the trace macros expand to nothing. */
#define XTRACE(l, a)
#define XENTER(l, a)
#define XEXIT(l, a)
#endif
/*
* functions.
*/
typedef struct rwintev {
} rwintev_t;
typedef struct ncintev {
} ncintev_t;
typedef struct cnintev {
} cnintev_t;
/*
* Per-socket state.
*/
struct isc_socket {
/* Not locked. */
unsigned int magic; /* structure validity tag */
/* Locked by socket lock. */
unsigned int references; /* reference count; incremented on attach, decremented on detach */
int fd; /* descriptor; per the comments in this file, set to -1 once closed */
int addrlength; /* remote addrlen */
};
/*
* True when 'm' is non-NULL and its magic field equals SOCKET_MANAGER_MAGIC.
* Note that 'm' is evaluated twice, so it must not have side effects.
*/
#define VALID_MANAGER(m) ((m) != NULL && \
(m)->magic == SOCKET_MANAGER_MAGIC)
/*
* Socket manager state.
*/
struct isc_socketmgr {
/* Not locked. */
unsigned int magic; /* compared against SOCKET_MANAGER_MAGIC by VALID_MANAGER() */
/* Locked by manager lock. */
unsigned int nsockets; /* sockets managed */
int fdstate[FD_SETSIZE]; /* one entry per possible fd; presumably CLOSED/MANAGED/CLOSE_PENDING -- confirm */
int maxfd; /* NOTE(review): presumably the highest fd being watched -- confirm */
int pipe_fds[2]; /* NOTE(review): presumably the pipe used to poke the watcher -- confirm */
};
/*
* Descriptor states.
* NOTE(review): presumably the values stored in the manager's fdstate[]
* array -- confirm against the code that reads and writes it.
*/
#define CLOSED 0 /* this one must be zero */
#define MANAGED 1
#define CLOSE_PENDING 2
static void done_event_destroy(isc_event_t *);
static void free_socket(isc_socket_t **);
isc_socket_t **);
static void destroy(isc_socket_t **);
/*
* Special messages seen by the watcher loop. Both are negative because the
* watcher treats any message >= 0 as a wakeup on a socket.
*
* SELECT_POKE_SHUTDOWN: the watcher logs that it got SHUTDOWN.
* SELECT_POKE_NOTHING: nothing to read; also returned by the poke reader
* when read() fails.
*/
#define SELECT_POKE_SHUTDOWN (-1)
#define SELECT_POKE_NOTHING (-2)
/*
* Poke the select loop when there is something for us to do.
* We assume that if a write completes here, it will be inserted into the
* queue fully. That is, we will not get partial writes.
*/
static void
{
int cc;
if (cc < 0) /* XXX need to handle EAGAIN, EINTR here */
"write() failed during watcher poke: %s",
}
/*
* read a message on the internal fd.
*/
static int
{
int msg;
int cc;
if (cc < 0) {
if (SOFT_ERROR(errno))
return (SELECT_POKE_NOTHING);
"read() failed during watcher poke: %s",
return (SELECT_POKE_NOTHING);
}
return (msg);
}
/*
* Make a fd non-blocking.
*
* Adds O_NONBLOCK to the descriptor's flag word.
*
* Returns:
*	ISC_R_SUCCESS		the flags were updated.
*	ISC_R_UNEXPECTED	updating the flags failed (ret == -1); the
*				failure is reported with the fd, the flag
*				value and an error string.
*/
static isc_result_t
make_nonblock(int fd)
{
int ret;
int flags;
/*
* NOTE(review): confirm that the current flags are fetched, and that the
* fetch is checked for failure, before O_NONBLOCK is OR-ed in.
*/
flags |= O_NONBLOCK;
if (ret == -1) {
"fcntl(%d, F_SETFL, %d): %s",
return (ISC_R_UNEXPECTED);
}
return (ISC_R_SUCCESS);
}
#ifdef ISC_SOCKET_DEBUG
static void
{
printf("recv queue:\n");
printf("\tintev %p, done_ev %p, task %p, "
"canceled %d, posted %d",
}
printf("send queue:\n");
printf("\tintev %p, done_ev %p, task %p, "
"canceled %d, posted %d",
}
printf("accept queue:\n");
printf("\tintev %p, done_ev %p, task %p, "
"canceled %d, posted %d\n",
}
printf("--------\n");
}
#endif
/*
* Handle freeing a done event when needed.
*/
static void
{
/*
* detach from the socket. We would have already detached from the
* task when we actually queue this event up.
*/
sock->references--;
if (sock->references == 0)
if (kill_socket)
}
/*
* Kill.
*
* Caller must ensure locking.
*/
static void
{
if (sock->connect_ev)
/*
* No one has this socket open, so the watcher doesn't have to be
* poked, and the socket doesn't have to be locked.
*/
/*
* XXX should reset manager->maxfd here
*/
}
static isc_result_t
{
return (ISC_R_NOMEMORY);
sock->references = 0;
/*
* set up list of readers and writers to be initially empty
*/
sock->addrlength = 0;
/*
* initialize the lock
*/
"isc_mutex_init() failed");
return (ISC_R_UNEXPECTED);
}
return (ISC_R_SUCCESS);
}
/*
* This event requires that the various lists be empty, that the reference
* count be 1, and that the magic number is valid. The other socket bits,
* like the lock, must be initialized as well. The fd associated must be
* marked as closed, by setting it to -1 on close, or this routine will
* also close the socket.
*/
static void
{
}
/*
* Create a new 'type' socket managed by 'manager'. The sockets
* parameters are specified by 'expires' and 'interval'. Events
* will be posted to 'task' and when dispatched 'action' will be
* called with 'arg' as the arg value. The new socket is returned
* in 'socketp'.
*/
{
if (ret != ISC_R_SUCCESS)
return (ret);
switch (type) {
case isc_socket_udp:
break;
case isc_socket_tcp:
break;
}
free_socket(&sock);
switch (errno) {
case EMFILE:
case ENFILE:
case ENOBUFS:
return (ISC_R_NORESOURCES);
break;
default:
"socket() failed: %s",
return (ISC_R_UNEXPECTED);
break;
}
}
free_socket(&sock);
return (ISC_R_UNEXPECTED);
}
/*
* Note we don't have to lock the socket like we normally would because
* there are no external references to it yet.
*/
return (ISC_R_SUCCESS);
}
/*
* Attach to a socket. Caller must explicitly detach when it is done.
*/
void
{
sock->references++;
}
/*
* Dereference a socket. If this is the last reference to it, clean things
* up by destroying the socket.
*/
void
{
sock->references--;
if (sock->references == 0)
if (kill_socket)
}
/*
* I/O is possible on a given socket. Schedule an event to this task that
* will call an internal function to do the I/O. This will charge the
* task with the I/O operation and let our select loop handler get back
* to doing something real as fast as possible.
*
* The socket and manager must be locked before calling this function.
*/
static void
{
}
static void
{
}
static void
{
}
static void
{
}
/*
* Dequeue an item off the given socket's read queue, set the result code
* in the done event to the one provided, and send it to the task it was
* destined for.
*
* Caller must have the socket locked.
*/
static void
{
}
static void
{
}
static void
{
}
/*
* Call accept() on a socket, to get the new file descriptor. The listen
* socket is used as a prototype to create a new isc_socket_t. The new
* socket is referenced twice (once for the task which is receiving this
* message, and once for the message itself) so the task does not need to
* attach to the socket again. The task is not attached at all.
*/
static void
{
int fd;
("internal_accept called, locked parent sock %p\n", sock));
/*
* Has this event been canceled?
*/
return;
}
/*
* Try to accept the new connection. If the accept fails with
* EAGAIN or EINTR, simply poke the watcher to watch this socket
* again.
*/
if (fd < 0) {
if (SOFT_ERROR(errno)) {
return;
}
/*
* If some other error, ignore it as well and hope
* for the best, but log it.
*/
fd = -1;
}
fd = -1;
}
/*
* The accept succeeded. Pull off the done event and set the
* fd and other information in the socket descriptor here. These
* were preallocated for us.
*/
/*
* -1 means the new socket didn't happen.
*/
if (fd != -1) {
/*
* Save away the remote address
*/
}
}
static void
{
int cc;
/*
* Find out what socket this is and lock it.
*/
/*
* Pull the first entry off the list, and look at it. If it is
* NULL, or not ours, something bad happened.
*/
/*
* Try to do as much I/O as possible on this socket. There are no
* limits here, currently. If some sort of quantum read count is
* desired before giving up control, make certain to process markers
* regardless of quantum.
*/
do {
/*
* check for canceled I/O
*/
goto next;
}
/*
* If this is a marker event, post its completion and
* continue the loop.
*/
sock->recv_result);
goto next;
}
/*
* It must be a read request. Try to satisfy it as best
* we can.
*/
read_count, 0,
&addrlen);
} else {
read_count, 0);
}
/*
* check for error or block condition
*/
if (cc < 0) {
if (SOFT_ERROR(errno))
goto poke;
#if 0
} \
goto next; \
}
/*
* This might not be a permanent error.
*/
goto next;
}
#endif
goto next;
}
/*
* read of 0 means the remote end was closed. Run through
* the event queue and dispatch all the events with an EOF
* result code. This will set the EOF flag in markers as
* well, but that's really ok.
*/
if (cc == 0) {
do {
goto poke;
}
/*
* if we read less than we expected, update counters,
* poke.
*/
/*
* If partial reads are allowed, we return whatever
* was read with a success result, and continue
* the loop.
*/
goto next;
}
/*
* Partials not ok. Exit the loop and notify the
* watcher to wait for more reads
*/
goto poke;
}
/*
* Exactly what we wanted to read. We're done with this
* entry. Post its completion event.
*/
dev->n += read_count;
}
next:
; /* some compilers need this here... */
poke:
}
static void
{
int cc;
/*
* Find out what socket this is and lock it.
*/
/*
* Pull the first entry off the list, and look at it. If it is
* NULL, or not ours, something bad happened.
*/
/*
* Try to do as much I/O as possible on this socket. There are no
* limits here, currently. If some sort of quantum write count is
* desired before giving up control, make certain to process markers
* regardless of quantum.
*/
do {
/*
* check for canceled I/O
*/
goto next;
}
/*
* If this is a marker event, post its completion and
* continue the loop.
*/
sock->send_result);
goto next;
}
/*
* It must be a write request. Try to satisfy it as best
* we can.
*/
write_count, 0,
(int)dev->addrlength);
else
write_count, 0);
/*
* check for error or block condition
*/
if (cc < 0) {
if (SOFT_ERROR(errno))
goto poke;
} \
goto next; \
}
/*
* This might not be a permanent error.
*/
goto next;
}
/*
* The other error types depend on whether or not the
* socket is UDP or TCP. If it is UDP, some errors
* that we expect to be fatal under TCP are merely
* annoying, and are really soft errors.
*
* However, these soft errors are still returned as
* a status.
*/
"internal_send: %s",
goto next;
}
if (cc == 0)
"internal_send: send() returned 0");
/*
* if we write less than we expected, update counters,
* poke.
*/
goto poke;
}
/*
* Exactly what we wanted to write. We're done with this
* entry. Post its completion event.
*/
dev->n += write_count;
goto next;
}
next:
; /* some compilers need this here... */
poke:
}
/*
* This is the thread that will loop forever, always in a select or poll
* call.
*
* When select returns something to do, track down what thread gets to do
* this I/O and post the event to it.
*/
static isc_threadresult_t
{
int ctlfd;
int cc;
int msg;
int i;
int maxfd;
/*
* Get the control fd here. This will never change.
*/
while (!done) {
do {
#ifdef ISC_SOCKET_DEBUG
for (i = 0 ; i < FD_SETSIZE ; i++) {
int printit;
printit = 0;
printf("watcher: select r on %d\n", i);
printit = 1;
}
printf("watcher: select w on %d\n", i);
printit = 1;
}
}
#endif
("select(%d, ...) == %d, errno %d\n",
if (cc < 0) {
if (!SOFT_ERROR(errno))
"select failed: %s",
}
} while (cc < 0);
/*
* Process reads on internal, control fd.
*/
while (1) {
("watcher got message %d\n", msg));
/*
* Nothing to read?
*/
if (msg == SELECT_POKE_NOTHING)
break;
/*
* handle shutdown message. We really should
* jump out of this loop right away, but
* it doesn't matter if we have to do a little
* more work first.
*/
if (msg == SELECT_POKE_SHUTDOWN) {
("watcher got SHUTDOWN\n"));
break;
}
/*
* This is a wakeup on a socket. Look
* at the event queue for both read and write,
* and decide if we need to watch on it now
* or not.
*/
if (msg >= 0) {
("Watcher closed %d\n",
msg));
continue;
}
continue;
("watcher locked socket %p\n",
sock));
/*
* If there are no events, or there
* is an event but we have already
* queued up the internal event on a
* task's queue, clear the bit.
* Otherwise, set it.
*/
|| sock->pending_recv
|| sock->pending_accept) {
("watch cleared r\n"));
} else {
("watch set r\n"));
}
|| sock->pending_send)
&& !sock->connecting) {
("watch cleared w\n"));
} else {
("watch set w\n"));
}
}
}
}
/*
* and unlocking twice if both reads and writes are possible.
*/
for (i = 0 ; i < maxfd ; i++) {
continue;
close(i);
("Watcher closed %d\n", i));
continue;
}
goto check_write;
}
("watcher r on %d, sock %p\n",
else
}
continue;
}
("watcher w on %d, sock %p\n",
if (!unlock_sock) {
}
if (sock->connecting)
else
}
if (unlock_sock)
}
}
return ((isc_threadresult_t)0);
}
/*
* Create a new socket manager.
*/
{
return (ISC_R_NOMEMORY);
"isc_mutex_init() failed");
return (ISC_R_UNEXPECTED);
}
/*
* Create the special fds that will be used to wake up the
*/
"pipe() failed: %s",
return (ISC_R_UNEXPECTED);
}
/*
* Set up initial state for the select loop
*/
/*
*/
"isc_thread_create() failed");
return (ISC_R_UNEXPECTED);
}
return (ISC_R_SUCCESS);
}
void
{
int i;
/*
* Destroy a socket manager.
*/
/*
* XXX do this right, with a condition variable
*/
sleep(1);
}
/*
* half of the pipe, which will send EOF to the read half.
*/
/*
* Wait for thread to exit.
*/
"isc_thread_join() failed");
/*
* Clean up.
*/
for (i = 0 ; i < FD_SETSIZE ; i++)
close(i);
}
{
int cc;
return (ISC_R_NOMEMORY);
sock,
sock,
sizeof(*iev));
/* no special free routine yet */
return (ISC_R_NOMEMORY);
}
}
/*
* Remember that we need to detach on event free
*/
/*
* UDP sockets are always partial read
*/
ev->n = 0;
/*
* If the read queue is empty, try to do the I/O right now.
*/
&ev->addrlength);
} else {
}
if (cc < 0) {
if (SOFT_ERROR(errno))
goto queue;
"isc_socket_recv: %s",
return (ISC_R_SUCCESS);
}
if (cc == 0) {
return (ISC_R_SUCCESS);
}
/*
* Partial reads need to be queued
*/
goto queue;
/*
* full reads are posted, or partials if partials are ok.
*/
return (ISC_R_SUCCESS);
}
/*
* We couldn't read all or part of the request right now, so queue
* it.
*/
/*
* Enqueue the request. If the socket was previously not being
* watched, poke the watcher to start paying attention to it.
*/
} else {
}
("isc_socket_recv: posted ievent %p, dev %p, task %p\n",
return (ISC_R_SUCCESS);
}
{
}
{
int cc;
return (ISC_R_NOMEMORY);
sock,
sock,
sizeof(*iev));
/* no special free routine yet */
return (ISC_R_NOMEMORY);
}
}
/*
* Remember that we need to detach on event free
*/
ev->n = 0;
/*
* If the write queue is empty, try to do the I/O right now.
*/
if (addrlength > 0) {
} else if (sock->addrlength > 0) {
}
INSIST(addrlength == 0);
}
(int)ev->addrlength);
else {
"isc_socket_send: "
"unknown socket type");
return (ISC_R_UNEXPECTED);
}
if (cc < 0) {
if (SOFT_ERROR(errno))
goto queue;
"isc_socket_send: %s",
return (ISC_R_UNEXPECTED);
}
if (cc == 0) {
return (ISC_R_SUCCESS);
}
/*
* Partial writes need to be queued
*/
goto queue;
/*
* full writes are posted.
*/
return (ISC_R_SUCCESS);
}
/*
* We couldn't send all or part of the request right now, so queue
* it.
*/
/*
* Enqueue the request. If the socket was previously not being
* watched, poke the watcher to start paying attention to it.
*/
} else {
}
("isc_socket_send: posted ievent %p, dev %p, task %p\n",
return (ISC_R_SUCCESS);
}
int addrlen)
{
int on = 1;
/* Press on... */
}
switch (errno) {
case EACCES:
return (ISC_R_NOPERM);
break;
case EADDRNOTAVAIL:
return (ISC_R_ADDRNOTAVAIL);
break;
case EADDRINUSE:
return (ISC_R_ADDRINUSE);
break;
case EINVAL:
return (ISC_R_BOUND);
break;
default:
return (ISC_R_UNEXPECTED);
break;
}
}
return (ISC_R_SUCCESS);
}
/*
* set up to listen on a given socket. We do this by creating an internal
* event that will be dispatched when the socket has read activity. The
* watcher will send the internal event to the task when there is a new
* connection.
*
* Unlike in read, we don't preallocate a done event here. Every time there
* is a new connection we'll have to allocate a new one anyway, so we might
* as well keep things simple rather than having to track them.
*/
{
if (backlog == 0)
return (ISC_R_UNEXPECTED);
}
return (ISC_R_SUCCESS);
}
/*
* This should try to do aggressive accept()
*/
{
sizeof(*iev));
return (ISC_R_NOMEMORY);
}
dev = (isc_socket_newconnev_t *)
sock,
arg,
sizeof (*dev));
return (ISC_R_NOMEMORY);
}
if (ret != ISC_R_SUCCESS) {
return (ret);
}
/*
* Attach to socket and to task
*/
sock->references++;
nsock->references++;
/*
* poke watcher here. We still have the socket locked, so there
* is no race condition. We will keep the lock for such a short
* bit of time that waking it up now or later won't matter all that much.
*/
return (ISC_R_SUCCESS);
}
{
int cc;
sock,
sock,
return (ISC_R_NOMEMORY);
}
}
sock,
arg,
sizeof (*dev));
return (ISC_R_NOMEMORY);
}
/*
* Try to do the connect right away, as there can be only one
* outstanding, and it might happen to complete.
*/
if (cc < 0) {
goto queue;
return (ISC_R_UNEXPECTED);
}
/*
* attach to socket
*/
sock->references++;
/*
* If connect completed, fire off the done event
*/
if (cc == 0) {
return (ISC_R_SUCCESS);
}
/*
* Attach to task
*/
/*
* poke watcher here. We still have the socket locked, so there
* is no race condition. We will keep the lock for such a short
* bit of time that waking it up now or later won't matter all that much.
*/
return (ISC_R_SUCCESS);
}
/*
* Called when a socket with a pending connect() finishes.
*/
static void
{
int cc;
int optlen;
("internal_connect called, locked parent sock %p\n", sock));
/*
* Has this event been canceled?
*/
return;
}
/*
* Get any possible error status here.
*/
else
if (errno != 0) {
/*
* If the error is EAGAIN, just re-select on this
* fd and pretend nothing strange happened.
*/
return;
}
/*
* Translate other errors into ISC_R_* flavors.
*/
switch (errno) {
case ETIMEDOUT:
break;
case ECONNREFUSED:
break;
case ENETUNREACH:
break;
default:
"internal_connect: connect() %s",
break;
}
}
}
int *lengthp)
{
return (ISC_R_TOOSMALL);
}
return (ISC_R_SUCCESS);
}
int *lengthp)
{
int len;
return (ISC_R_UNEXPECTED);
}
return (ISC_R_TOOSMALL);
}
return (ISC_R_SUCCESS);
}
/*
* Run through the list of events on this socket, and cancel the ones
* queued for task "task" of type "how". "how" is a bitmask.
*/
void
unsigned int how)
{
/*
* Quick exit if there is nothing to do. Don't even bother locking
* in this case.
*/
if (how == 0)
return;
/*
* All of these do the same thing, more or less.
* Each will:
* o If the internal event is marked as "posted" try to
* remove it from the task's queue. If this fails, mark it
* as canceled instead, and let the task clean it up later.
* o For each I/O request for that task of that type, post
* its done event with status of "ISC_R_CANCELED".
* o Reset any state needed.
*/
/*
* If the internal event was posted, try to remove
* it from the task's queue. If this fails,
* set the canceled flag, post the done event, and
* point "iev" to the next item on the list, and enter
* the while loop. Otherwise, just enter the while loop
* and let it dispatch the done event.
*/
ISC_SOCKEVENT_INTRECV) == 0) {
/*
* pull off the done event and post it.
*/
(isc_event_t **)&dev);
}
}
/*
* run through the event queue, posting done events with the
* canceled result, and freeing the internal event.
*/
}
}
/*
* If the internal event was posted, try to remove
* it from the task's queue. If this fails,
* set the canceled flag, post the done event, and
* point "iev" to the next item on the list, and enter
* the while loop. Otherwise, just enter the while loop
* and let it dispatch the done event.
*/
ISC_SOCKEVENT_INTSEND) == 0) {
/*
* pull off the done event and post it.
*/
(isc_event_t **)&dev);
}
}
/*
* run through the event queue, posting done events with the
* canceled result, and freeing the internal event.
*/
}
}
ISC_SOCKEVENT_INTACCEPT) == 0) {
(isc_event_t **)&dev);
}
}
}
}
}
if (how & ISC_SOCKCANCEL_CONNECT) {
}
/*
* Need to guess if we need to poke or not... XXX
*/
}
{
sizeof(*dev));
return (ISC_R_NOMEMORY);
/*
* If the queue is empty, simply return the last error we got on
* this socket as the result code, and send off the done event.
*/
sock->references++;
return (ISC_R_SUCCESS);
}
/*
* Bad luck. The queue wasn't empty. Insert this in the proper
* place.
*/
sock,
sock,
sizeof(*iev));
return (ISC_R_NOMEMORY);
}
sock->references++;
("isc_socket_recvmark: posted ievent %p, dev %p, task %p\n",
return (ISC_R_SUCCESS);
}
{
sizeof(*dev));
return (ISC_R_NOMEMORY);
/*
* If the queue is empty, simply return the last error we got on
* this socket as the result code, and send off the done event.
*/
sock->references++;
return (ISC_R_SUCCESS);
}
/*
* Bad luck. The queue wasn't empty. Insert this in the proper
* place.
*/
sock,
sock,
sizeof(*iev));
return (ISC_R_NOMEMORY);
}
sock->references++;
("isc_socket_sendmark: posted ievent %p, dev %p, task %p\n",
return (ISC_R_SUCCESS);
}