fileaio-linux.cpp revision 3e3726d7acce34f5b1e11d83c9bfa45082605a06
/* $Id$ */
/** @file
* IPRT - File async I/O, native implementation for the Linux host platform.
*/
/*
* Copyright (C) 2006-2007 Sun Microsystems, Inc.
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*
* The contents of this file may alternatively be used under the terms
* of the Common Development and Distribution License Version 1.0
* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
* VirtualBox OSE distribution, in which case the provisions of the
* CDDL are applicable instead of those of the GPL.
*
* You may elect to license modified versions of this file under the
* terms and conditions of either the GPL or the CDDL or both.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
* Clara, CA 95054 USA or visit http://www.sun.com if you need
* additional information or have any questions.
*/
/** @page pg_rtfileaio_linux RTFile Async I/O - Linux Implementation Notes
* @internal
*
* Linux implements the kernel async I/O API through the io_* syscalls. They are
* not exposed by glibc (its aio_* API uses userspace threads and blocking
* I/O operations to simulate async behavior). There is an external library
* called libaio which implements these syscalls, but it is not installed by
* default and we don't want to have another dependency. Because the interface
* is really simple, we use the kernel interface directly through wrapper
* functions.
*
* The interface has some limitations. The first one is that the file must be
* opened with O_DIRECT. This disables caching done by the kernel, which can be
* compensated for if the user of this API implements caching itself. The next
* limitation is that data buffers must be aligned on a 512 byte boundary or the
* request will fail.
*/
/** @todo r=bird: What's this about "must be opened with O_DIRECT"? An
* explanation would be nice, esp. seeing what Linus is quoted saying
* about it in the open man page... */
/*******************************************************************************
* Header Files *
*******************************************************************************/
#define LOG_GROUP RTLOGGROUP_FILE
#define _LINUX_BYTEORDER_SWABB_H
#include <unistd.h>
#include <errno.h>
/*******************************************************************************
* Structures and Typedefs *
*******************************************************************************/
/**
* The iocb structure of a request which is passed to the kernel.
*
* We redefine this here because the version in the header lacks padding
* for 32-bit.
*
* NOTE(review): only pvUser and pvBuf are declared below; the remaining field
* comments and the RT_ARCH_X86 / RT_ARCH_AMD64 conditionals have no member
* declarations attached to them. Confirm the layout against the kernel's
* iocb definition before relying on this structure.
*/
typedef struct LNXKAIOIOCB
{
/** Opaque pointer to data which is returned on an I/O event. */
void *pvUser;
#ifdef RT_ARCH_X86
#endif
/** Contains the request number and is set by the kernel. */
/** Reserved. */
/** The I/O opcode. */
/** Request priority. */
/** The file descriptor. */
/** The userspace pointer to the buffer containing/receiving the data. */
void *pvBuf;
#ifdef RT_ARCH_X86
#endif
/** How many bytes to transfer. */
#ifdef RT_ARCH_X86
#elif defined(RT_ARCH_AMD64)
#else
# error "Unknown architecture"
#endif
/** At which offset to start the transfer. */
/** Reserved. */
/** Flags */
/** Readiness signal file descriptor. */
} LNXKAIOIOCB, *PLNXKAIOIOCB;
/**
* I/O event structure to notify about completed requests.
* Redefined here too because of the padding.
*/
typedef struct LNXKAIOIOEVENT
{
/** The pvUser field from the iocb. */
void *pvUser;
#ifdef RT_ARCH_X86
#endif
/** The LNXKAIOIOCB object this event is for. */
#ifdef RT_ARCH_X86
#endif
/** The result code of the operation. */
#ifdef RT_ARCH_X86
#elif defined(RT_ARCH_AMD64)
#else
# error "Unknown architecture"
#endif
/** Secondary result code. */
#ifdef RT_ARCH_X86
#elif defined(RT_ARCH_AMD64)
#else
# error "Unknown architecture"
#endif
/**
* Async I/O completion context state.
*/
typedef struct RTFILEAIOCTXINTERNAL
{
/** Handle to the async I/O context. */
/** Maximum number of requests this context can handle. */
int cRequestsMax;
/** Current number of requests active on this context. */
/** The ID of the thread which is currently waiting for requests. */
volatile RTTHREAD hThreadWait;
/** Flag whether the thread was woken up. */
volatile bool fWokenUp;
/** Flag whether the thread is currently waiting in the syscall. */
volatile bool fWaiting;
/** Magic value (RTFILEAIOCTX_MAGIC). */
/** Pointer to an internal context structure. */
typedef RTFILEAIOCTXINTERNAL *PRTFILEAIOCTXINTERNAL;
/**
* Async I/O request state.
*/
typedef struct RTFILEAIOREQINTERNAL
{
/** The aio control block. This must be the FIRST element in
* the structure! (see notes below) */
/** The I/O context this request is associated with. */
/** Return code the request completed with. */
int Rc;
/** Flag whether the request is in process or not. */
bool fFinished;
/** Number of bytes actually transferred. */
/** Completion context we are assigned to. */
/** Magic value (RTFILEAIOREQ_MAGIC). */
/** Pointer to an internal request structure. */
typedef RTFILEAIOREQINTERNAL *PRTFILEAIOREQINTERNAL;
/*******************************************************************************
* Defined Constants And Macros *
*******************************************************************************/
/** The max number of events to get in one call.
* NOTE(review): the macro name speaks of requests per context while this
* description speaks of events per call - confirm which limit is meant. */
#define AIO_MAXIMUM_REQUESTS_PER_CONTEXT 64
/**
* Creates a new async I/O context.
*/
{
return RTErrConvertFromErrno(errno);
return VINF_SUCCESS;
}
/**
* Destroys an async I/O context.
*/
{
return RTErrConvertFromErrno(errno);
return VINF_SUCCESS;
}
/**
* Submits an array of I/O requests to the kernel.
*
* @returns The value of RTErrConvertFromErrno(errno); see the review note.
* @param AioContext The async I/O context the requests are submitted to.
* @param cReqs Number of requests - presumably the entry count of
* ppIoCB; confirm against the caller.
* @param ppIoCB Array of pointers to the I/O control blocks.
*
* NOTE(review): as written the first return statement is unconditional, so
* the VINF_SUCCESS return is unreachable and none of the parameters are
* used. The system call and the check of its result appear to be missing -
* confirm against the intended implementation.
*/
DECLINLINE(int) rtFileAsyncIoLinuxSubmit(aio_context_t AioContext, long cReqs, LNXKAIOIOCB **ppIoCB)
{
return RTErrConvertFromErrno(errno);
return VINF_SUCCESS;
}
/**
* Cancels an I/O request.
*
* @returns The value of RTErrConvertFromErrno(errno); see the review note.
* @param AioContext The async I/O context the request belongs to.
* @param pIoCB The I/O control block of the request to cancel.
* @param pIoResult Event structure - presumably receives the result of the
* cancelled request; confirm against the caller.
*
* NOTE(review): as written the first return statement is unconditional, so
* the VINF_SUCCESS return is unreachable and none of the parameters are
* used. The system call and the check of its result appear to be missing -
* confirm against the intended implementation.
*/
DECLINLINE(int) rtFileAsyncIoLinuxCancel(aio_context_t AioContext, PLNXKAIOIOCB pIoCB, PLNXKAIOIOEVENT pIoResult)
{
return RTErrConvertFromErrno(errno);
return VINF_SUCCESS;
}
/**
* Waits for I/O events.
* @returns Number of events (natural number w/ 0), IPRT error code (negative).
*/
{
return RTErrConvertFromErrno(errno);
return rc;
}
{
/*
* Allocate a new request and initialize it.
*/
if (RT_UNLIKELY(!pReqInt))
return VERR_NO_MEMORY;
return VINF_SUCCESS;
}
{
/*
* Validate the handle and ignore nil.
*/
if (hReq == NIL_RTFILEAIOREQ)
return;
/*
* Trash the magic and free it.
*/
}
/**
* Worker setting up the request.
*/
void *pvUser)
{
/*
* Validate the input.
*/
Assert(cbTransfer > 0);
/*
* Setup the control block and clear the finished flag.
*/
return VINF_SUCCESS;
}
{
}
{
}
{
/** @todo: Flushing is not necessary on Linux because O_DIRECT is mandatory
* which disables caching.
* We could set up a fake request which isn't really executed
* to avoid platform dependent code in the caller.
*/
#if 0
#endif
return VERR_NOT_IMPLEMENTED;
}
{
}
{
if (RT_SUCCESS(rc))
{
/*
* Decrement request count because the request will never arrive at the
* completion port.
*/
("Invalid state. Request was canceled but wasn't submitted\n"));
return VINF_SUCCESS;
}
if (rc == VERR_TRY_AGAIN)
return VERR_FILE_AIO_IN_PROGRESS;
return rc;
}
{
return VERR_FILE_AIO_IN_PROGRESS;
if ( pcbTransfered
}
{
/* The kernel interface needs a maximum. */
if (cAioReqsMax == RTFILEAIO_UNLIMITED_REQS)
return VERR_OUT_OF_RANGE;
if (RT_UNLIKELY(!pCtxInt))
return VERR_NO_MEMORY;
/* Init the event handle. */
if (RT_SUCCESS(rc))
{
}
else
return rc;
}
{
/* Validate the handle and ignore nil. */
if (hAioCtx == NIL_RTFILEAIOCTX)
return VINF_SUCCESS;
/* Cannot destroy a busy context. */
return VERR_FILE_AIO_BUSY;
/* The native bit first, then mark it as dead and free it. */
if (RT_FAILURE(rc))
return rc;
return VINF_SUCCESS;
}
{
/* Nil means global here. */
if (hAioCtx == NIL_RTFILEAIOCTX)
return RTFILEAIO_UNLIMITED_REQS; /** @todo r=bird: I'm a bit puzzled by this return value since it
* is completely useless in RTFileAioCtxCreate. */
/* Return 0 if the handle is invalid, it's better than garbage I think... */
return pCtxInt->cRequestsMax;
}
{
/* Nothing to do. */
return VINF_SUCCESS;
}
/**
* Submits a set of requests to an async I/O context.
*
* @returns The status code held in rc (see the review note at the end of
* the body).
* @param hAioCtx The async I/O context handle.
* @param pahReqs Array of request handles. According to the comment in
* the body it is cast to an array of Linux iocb pointers
* rather than copied.
* @param cReqs Number of requests - presumably the entry count of
* pahReqs; confirm against the caller.
* @param pcReqs Output counter; it is reset to 0 at the start of the
* function body.
*/
RTDECL(int) RTFileAioCtxSubmit(RTFILEAIOCTX hAioCtx, PRTFILEAIOREQ pahReqs, size_t cReqs, size_t *pcReqs)
{
/*
* Parameter validation.
*/
*pcReqs = 0;
/*
* Validate requests and associate with the context.
*/
/* NOTE(review): the loop index i is not declared in the visible code and the
* loop body is empty - confirm the per-request handling was not lost. */
while (i-- > 0)
{
}
/*
* Add the submitted requests to the counter
* to prevent destroying the context while
* it is still used.
*/
/*
* We cast pahReqs to the Linux iocb structure to avoid copying the requests
* into a temporary array. This is possible because the iocb structure is
* the first element in the request structure (see RTFILEAIOREQINTERNAL).
*/
/* NOTE(review): rc is not declared or assigned in the visible code, and the
* statement guarded by the if below is missing (the if is followed directly
* by else) - confirm against the intended implementation. */
if (RT_FAILURE(rc))
else
return rc;
}
{
/*
* Validate the parameters, making sure to always set pcReqs.
*/
*pcReqs = 0; /* always set */
/*
* Can't wait if there are no requests around.
*/
return VERR_FILE_AIO_NO_REQUEST;
/*
* Convert the timeout if specified.
*/
uint64_t StartNanoTS = 0;
if (cMillisTimeout != RT_INDEFINITE_WAIT)
{
StartNanoTS = RTTimeNanoTS();
}
/* Wait for at least one. */
if (!cMinReqs)
cMinReqs = 1;
/* For the wakeup call. */
/*
* Loop until we're woken up, hit an error (incl timeout), or
* have collected the desired number of requests.
*/
int rc = VINF_SUCCESS;
int cRequestsCompleted = 0;
{
rc = rtFileAsyncIoLinuxGetEvents(pCtxInt->AioContext, cMinReqs, cRequestsToWait, &aPortEvents[0], pTimeout);
if (RT_FAILURE(rc))
break;
rc = VINF_SUCCESS;
/*
* Process received events / requests.
*/
{
/*
* The iocb is the first element in our request structure.
* So we can safely cast it directly to the handle (see above)
*/
/** @todo aeichner: The rc field contains the result code
* But there is a second field called rc2. I don't know the
* purpose for it yet.
*/
else
{
}
/* Mark the request as finished. */
}
/*
* Done Yet? If not advance and try again.
*/
break;
if (cMillisTimeout != RT_INDEFINITE_WAIT)
{
/* The API doesn't return ETIMEDOUT, so we have to fix that ourselves. */
if (cMilliesElapsed >= cMillisTimeout)
{
rc = VERR_TIMEOUT;
break;
}
/* The syscall supposedly updates it, but we're paranoid. :-) */
}
}
/*
* Update the context state and set the return value.
*/
/*
* Clear the wakeup flag and set rc.
*/
&& RT_SUCCESS(rc))
{
}
return rc;
}
{
/** @todo r=bird: Define the protocol for how to resume work after calling
* this function. */
/*
* Read the thread handle before the status flag.
* If we read the handle after the flag we might
* end up with an invalid handle because the thread
* waiting in RTFileAioCtxWait() might get scheduled
* before we read the flag and return.
* We can ensure that the handle is valid if fWaiting is true
* when reading the handle before the status flag.
*/
if ( !fWokenUp
&& fWaiting)
{
/*
* If a thread waits the handle must be valid.
* It is possible that the thread returns from
* rtFileAsyncIoLinuxGetEvents() before the signal
* is sent.
* This is no problem because we already set fWokenUp
* to true which will let the thread return VERR_INTERRUPTED
* and the next call to RTFileAioCtxWait() will not
* return VERR_INTERRUPTED because signals are not saved
* and will simply vanish if the destination thread can't
* receive them.
*/
}
return VINF_SUCCESS;
}