fileaio-posix.cpp revision 3633fa76957c4740b8d4372e0dfc938d3c3ea569
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * IPRT - File async I/O, native implementation for POSIX compliant host platforms.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Copyright (C) 2006-2007 Sun Microsystems, Inc.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * This file is part of VirtualBox Open Source Edition (OSE), as
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * available from http://www.virtualbox.org. This file is free software;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * you can redistribute it and/or modify it under the terms of the GNU
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * General Public License (GPL) as published by the Free Software
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Foundation, in version 2 as it comes in the "COPYING" file of the
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * The contents of this file may alternatively be used under the terms
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * of the Common Development and Distribution License Version 1.0
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * VirtualBox OSE distribution, in which case the provisions of the
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * CDDL are applicable instead of those of the GPL.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * You may elect to license modified versions of this file under the
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * terms and conditions of either the GPL or the CDDL or both.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Clara, CA 95054 USA or visit http://www.sun.com if you need
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * additional information or have any questions.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync/*******************************************************************************
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync* Header Files *
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync*******************************************************************************/
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync * Linux does not define this value.
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync * Just define it with a really big value.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync/*******************************************************************************
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync* Structures and Typedefs *
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync*******************************************************************************/
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Async I/O request state.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** The aio control block. FIRST ELEMENT! */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Next element in the chain. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Flag whether this is a flush request. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Flag indicating if the request was canceled. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync volatile bool fCanceled;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Opaque user data. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Number of bytes actually transferred. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Status code. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Completion context we are assigned to. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Entry in the waiting list the request is in. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Magic value (RTFILEAIOREQ_MAGIC). */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Async I/O completion context state.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Current number of requests active on this context. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Maximum number of requests this context can handle. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** The ID of the thread which is currently waiting for requests. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Flag whether the thread was woken up. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync volatile bool fWokenUp;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Flag whether the thread is currently waiting in the syscall. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync volatile bool fWaiting;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Magic value (RTFILEAIOCTX_MAGIC). */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Flag whether the thread was woken up due to an internal event. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync volatile bool fWokenUpInternal;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** List of new requests which need to be inserted into apReqs by the
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * waiting thread. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Special entry for requests which are canceled. Because only one
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * request can be canceled at a time and the thread canceling the request
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * has to wait we need only one entry. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Event semaphore the canceling thread waits on for completion of
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * the operation. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** Number of elements in the waiting list. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** First free slot in the waiting list. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** List of requests we are currently waiting on.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Size depends on cMaxRequests. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Internal worker for waking up the waiting thread.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncstatic void rtFileAioCtxWakeup(PRTFILEAIOCTXINTERNAL pCtxInt)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Read the thread handle before the status flag.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * If we read the handle after the flag we might
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * end up with an invalid handle because the thread
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * waiting in RTFileAioCtxWait() might get scheduled
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * before we read the flag and return.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * We can ensure that the handle is valid if fWaiting is true
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * when reading the handle before the status flag.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync ASMAtomicReadHandle(&pCtxInt->hThreadWait, &hThread);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync bool fWaiting = ASMAtomicReadBool(&pCtxInt->fWaiting);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * If a thread waits the handle must be valid.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * It is possible that the thread returns from
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * aio_suspend() before the signal is sent.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * This is no problem because we already set fWokenUp
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * to true which will let the thread return VERR_INTERRUPTED
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * and the next call to RTFileAioCtxWait() will not
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * return VERR_INTERRUPTED because signals are not saved
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * and will simply vanish if the destination thread can't
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * receive it.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Internal worker processing events and inserting new requests into the waiting list.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncstatic int rtFileAioCtxProcessEvents(PRTFILEAIOCTXINTERNAL pCtxInt)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Process new requests first. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync bool fWokenUp = ASMAtomicXchgBool(&pCtxInt->fWokenUpInternal, false);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync for (unsigned iSlot = 0; iSlot < RT_ELEMENTS(pCtxInt->apReqsNewHead); iSlot++)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync PRTFILEAIOREQINTERNAL pReqHead = (PRTFILEAIOREQINTERNAL)ASMAtomicXchgPtr((void* volatile*)&pCtxInt->apReqsNewHead[iSlot],
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Clear pointer to next element just for safety. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync pCtxInt->apReqs[pCtxInt->iFirstFree]->pNext = NULL;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync Assert(pCtxInt->iFirstFree <= pCtxInt->cMaxRequests);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Check if a request needs to be canceled. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync PRTFILEAIOREQINTERNAL pReqToCancel = (PRTFILEAIOREQINTERNAL)ASMAtomicReadPtr((void* volatile*)&pCtxInt->pReqToCancel);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Put it out of the waiting list. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync pCtxInt->apReqs[pReqToCancel->iWaitingList] = pCtxInt->apReqs[--pCtxInt->iFirstFree];
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync pCtxInt->apReqs[pReqToCancel->iWaitingList]->iWaitingList = pReqToCancel->iWaitingList;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTR3DECL(int) RTFileAioReqCreate(PRTFILEAIOREQ phReq)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync PRTFILEAIOREQINTERNAL pReqInt = (PRTFILEAIOREQINTERNAL)RTMemAllocZ(sizeof(RTFILEAIOREQINTERNAL));
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Validate the handle and ignore nil.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Trash the magic and free it.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync ASMAtomicUoWriteU32(&pReqInt->u32Magic, ~RTFILEAIOREQ_MAGIC);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Worker setting up the request.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncDECLINLINE(int) rtFileAioReqPrepareTransfer(RTFILEAIOREQ hReq, RTFILE hFile,
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Validate the input.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync pReqInt->AioCB.aio_lio_opcode = uTransferDirection;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTDECL(int) RTFileAioReqPrepareRead(RTFILEAIOREQ hReq, RTFILE hFile, RTFOFF off,
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync return rtFileAioReqPrepareTransfer(hReq, hFile, LIO_READ,
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTDECL(int) RTFileAioReqPrepareWrite(RTFILEAIOREQ hReq, RTFILE hFile, RTFOFF off,
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync return rtFileAioReqPrepareTransfer(hReq, hFile, LIO_WRITE,
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTDECL(int) RTFileAioReqPrepareFlush(RTFILEAIOREQ hReq, RTFILE hFile, void *pvUser)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync PRTFILEAIOREQINTERNAL pReqInt = (PRTFILEAIOREQINTERNAL)hReq;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTDECL(void *) RTFileAioReqGetUser(RTFILEAIOREQ hReq)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync int rcPosix = aio_cancel(pReqInt->AioCB.aio_fildes, &pReqInt->AioCB);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Notify the waiting thread that the request was canceled.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync ("Invalid state. Request was canceled but wasn't submitted\n"));
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync ASMAtomicWritePtr((void* volatile*)&pCtxInt->pReqToCancel, pReqInt);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Wait for acknowledge. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync int rc = RTSemEventWait(pCtxInt->SemEventCancel, RT_INDEFINITE_WAIT);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync ASMAtomicWritePtr((void* volatile*)&pCtxInt->pReqToCancel, NULL);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTDECL(int) RTFileAioReqGetRC(RTFILEAIOREQ hReq, size_t *pcbTransfered)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTDECL(int) RTFileAioCtxCreate(PRTFILEAIOCTX phAioCtx, uint32_t cAioReqsMax)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync pCtxInt = (PRTFILEAIOCTXINTERNAL)RTMemAllocZ( sizeof(RTFILEAIOCTXINTERNAL)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Create event semaphore. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync int rc = RTSemEventCreate(&pCtxInt->SemEventCancel);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTDECL(int) RTFileAioCtxDestroy(RTFILEAIOCTX hAioCtx)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTDECL(uint32_t) RTFileAioCtxGetMaxReqCount(RTFILEAIOCTX hAioCtx)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTDECL(int) RTFileAioCtxAssociateWithFile(RTFILEAIOCTX hAioCtx, RTFILE hFile)
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsyncRTDECL(int) RTFileAioCtxSubmit(RTFILEAIOCTX hAioCtx, PRTFILEAIOREQ pahReqs, size_t cReqs, size_t *pcReqs)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Parameter checks */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Check that we don't exceed the limit */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync if (ASMAtomicUoReadS32(&pCtxInt->cRequests) + cReqs > pCtxInt->cMaxRequests)
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync while ( (i < cReqs)
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync /* Link them together. */
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync rcPosix = lio_listio(LIO_NOWAIT, (struct aiocb **)pahReqs, cReqsSubmit, NULL);
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync /* Check if we have a flush request now. */
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync * lio_listio does not work with flush requests so
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync * we have to use aio_fsync directly.
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync * Forward successfully submitted requests to the thread waiting for requests.
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync * We search for a free slot first and if we don't find one
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync * we will grab the first one and append our list to the existing entries.
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync unsigned iSlot = 0;
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync while ( (iSlot < RT_ELEMENTS(pCtxInt->apReqsNewHead))
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync && !ASMAtomicCmpXchgPtr((void * volatile *)&pCtxInt->apReqsNewHead[iSlot], pHead, NULL))
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync /* Nothing found. */
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync PRTFILEAIOREQINTERNAL pOldHead = (PRTFILEAIOREQINTERNAL)ASMAtomicXchgPtr((void * volatile *)&pCtxInt->apReqsNewHead[0],
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync /* Find the end of the current head and link the old list to the current. */
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync ASMAtomicXchgPtr((void * volatile *)&pCtxInt->apReqsNewHead[0], pHead);
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync /* Set the internal wakeup flag and wakeup the thread if possible. */
3633fa76957c4740b8d4372e0dfc938d3c3ea569vboxsync bool fWokenUp = ASMAtomicXchgBool(&pCtxInt->fWokenUpInternal, true);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTDECL(int) RTFileAioCtxWait(RTFILEAIOCTX hAioCtx, size_t cMinReqs, unsigned cMillisTimeout,
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync PRTFILEAIOREQ pahReqs, size_t cReqs, uint32_t *pcReqs)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync PRTFILEAIOCTXINTERNAL pCtxInt = (PRTFILEAIOCTXINTERNAL)hAioCtx;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Check parameters. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync AssertReturn(cReqs >= cMinReqs, VERR_OUT_OF_RANGE);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync if (RT_UNLIKELY(ASMAtomicReadS32(&pCtxInt->cRequests) == 0))
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync Timeout.tv_nsec = (cMillisTimeout % 1000) * 1000000;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Wait for at least one. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* For the wakeup call. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync ASMAtomicWriteHandle(&pCtxInt->hThreadWait, RTThreadSelf());
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Update the waiting list once before we enter the loop. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync int rcPosix = aio_suspend((const struct aiocb * const *)pCtxInt->apReqs,
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Check that this is an external wakeup event. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Requests finished. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync unsigned iReqCurr = 0;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Remove completed requests from the waiting list. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync PRTFILEAIOREQINTERNAL pReq = pCtxInt->apReqs[iReqCurr];
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Completed, store the return code. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Call aio_return() to free resources. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * Move the last entry into the current position to avoid holes
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * but only if it is not the last element already.
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync pCtxInt->apReqs[pReq->iWaitingList] = pCtxInt->apReqs[--pCtxInt->iFirstFree];
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync pCtxInt->apReqs[pReq->iWaitingList]->iWaitingList = pReq->iWaitingList;
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Put the request into the completed list. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync if ((cMillisTimeout != RT_INDEFINITE_WAIT) && (cMinReqs > 0))
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Recalculate the timeout. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync Timeout.tv_sec = Timeout.tv_sec - (TimeDiff / 1000000);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync Timeout.tv_nsec = Timeout.tv_nsec - (TimeDiff % 1000000);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /* Check for new elements. */
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync ASMAtomicWriteHandle(&pCtxInt->hThreadWait, NIL_RTTHREAD);
cba55d7782acd89472e03b4376591ff57fe7b80evboxsyncRTDECL(int) RTFileAioCtxWakeup(RTFILEAIOCTX hAioCtx)
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync /** @todo r=bird: Define the protocol for how to resume work after calling
cba55d7782acd89472e03b4376591ff57fe7b80evboxsync * this function. */