PDMAsyncCompletionFile.cpp revision a9f41cb889f53e8407561a6155052c441eb0fc5f
/* $Id$ */
/** @file
* PDM Async I/O - Transport data asynchronous in R3 using EMT.
*/
/*
* Copyright (C) 2006-2009 Sun Microsystems, Inc.
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
* Clara, CA 95054 USA or visit http://www.sun.com if you need
* additional information or have any questions.
*/
/*******************************************************************************
* Header Files *
*******************************************************************************/
#define LOG_GROUP LOG_GROUP_PDM_ASYNC_COMPLETION
//#define DEBUG
#include "PDMInternal.h"
#include <VBox/pdm.h>
#include <VBox/mm.h>
#include <VBox/vm.h>
#include <VBox/err.h>
#include <VBox/log.h>
#include <iprt/asm.h>
#include <iprt/assert.h>
#include <iprt/critsect.h>
#include <iprt/env.h>
#include <iprt/file.h>
#include <iprt/mem.h>
#include <iprt/semaphore.h>
#include <iprt/string.h>
#include <iprt/thread.h>
#include <iprt/path.h>
#include "PDMAsyncCompletionFileInternal.h"
/**
 * Releases an I/O task, keeping it for reuse when the endpoint cache has room.
 *
 * The task is appended to the tail of the endpoint's free list as long as the
 * number of cached tasks is below the class wide maximum; otherwise it is
 * handed back to the MM heap.
 *
 * @returns nothing.
 * @param   pEndpoint    Pointer to the endpoint the task was used for.
 * @param   pTask        The task to free.
 */
void pdmacFileTaskFree(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint,
                       PPDMACTASKFILE pTask)
{
    PPDMASYNCCOMPLETIONEPCLASSFILE pClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pEndpoint->Core.pEpClass;

    LogFlowFunc((": pEndpoint=%p pTask=%p\n", pEndpoint, pTask));

    /* Cache is full: give the memory back to the heap. */
    if (pEndpoint->cTasksCached >= pClassFile->cTasksCacheMax)
    {
        Log(("Freeing task %p because all caches are full\n", pTask));
        MMR3HeapFree(pTask);
        return;
    }

    /* Otherwise append the task behind the current tail of the free list. */
    PPDMACTASKFILE pOldTail = pEndpoint->pTasksFreeTail;
    pOldTail->pNext           = pTask;
    pEndpoint->pTasksFreeTail = pTask;
    ASMAtomicIncU32(&pEndpoint->cTasksCached);
}
/**
 * Allocates an I/O task.
 *
 * A task is taken from the head of the endpoint's free list when that list
 * holds more than its single placeholder element (head != tail); otherwise a
 * new zeroed task is allocated from the MM heap.
 *
 * @returns Pointer to the new task with pNext cleared, or NULL if the heap
 *          allocation failed.
 * @param   pEndpoint    Pointer to the endpoint.
 */
PPDMACTASKFILE pdmacFileTaskAlloc(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint)
{
    PPDMACTASKFILE pTask = NULL;

    /* Try the small per endpoint cache first. */
    if (pEndpoint->pTasksFreeHead == pEndpoint->pTasksFreeTail)
    {
        /* The cache is empty, the class data is only needed for the VM handle. */
        PPDMASYNCCOMPLETIONEPCLASSFILE pEndpointClass = (PPDMASYNCCOMPLETIONEPCLASSFILE)pEndpoint->Core.pEpClass;

        /*
         * Allocate completely new.
         * If this fails we return NULL.
         */
        int rc = MMR3HeapAllocZEx(pEndpointClass->Core.pVM, MM_TAG_PDM_ASYNC_COMPLETION,
                                  sizeof(PDMACTASKFILE),
                                  (void **)&pTask);
        if (RT_FAILURE(rc))
            pTask = NULL;

        LogFlow(("Allocated task %p\n", pTask));
    }
    else
    {
        /* Grab a free task from the head. */
        AssertMsg(pEndpoint->cTasksCached > 0, ("No tasks cached but list contains more than one element\n"));

        pTask = pEndpoint->pTasksFreeHead;
        pEndpoint->pTasksFreeHead = pTask->pNext;
        ASMAtomicDecU32(&pEndpoint->cTasksCached);
    }

    /* Only touch the task if we actually got one; the allocation above may have failed. */
    if (pTask)
        pTask->pNext = NULL;

    return pTask;
}
/**
 * Detaches all newly queued tasks from an endpoint.
 *
 * The pending list is swapped out atomically and then reversed, because
 * producers push new tasks at the head and the consumer wants FIFO order.
 *
 * @returns Head of the detached task list in submission order, or NULL if
 *          nothing was pending.
 * @param   pEndpoint    The endpoint to fetch the new tasks from.
 */
PPDMACTASKFILE pdmacFileEpGetNewTasks(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint)
{
    /* Take the whole pending list in one go. */
    PPDMACTASKFILE pPending  = (PPDMACTASKFILE)ASMAtomicXchgPtr((void * volatile *)&pEndpoint->pTasksNewHead, NULL);
    PPDMACTASKFILE pReversed = NULL;

    /* Move every element to the front of the result list, which reverses the order. */
    while (pPending)
    {
        PPDMACTASKFILE pFollowing = pPending->pNext;

        pPending->pNext = pReversed;
        pReversed       = pPending;
        pPending        = pFollowing;
    }

    return pReversed;
}
/**
 * Wakes up an async I/O manager unless a wakeup is already flagged.
 *
 * The fWokenUp flag is set unconditionally; the event semaphore is signalled
 * only by the caller that flipped the flag and only if the manager reports
 * that it is waiting on the semaphore.
 *
 * @returns nothing.
 * @param   pAioMgr    The async I/O manager to wake up.
 */
static void pdmacFileAioMgrWakeup(PPDMACEPFILEMGR pAioMgr)
{
    /* Somebody else already flagged the wakeup, nothing left to do. */
    if (ASMAtomicXchgBool(&pAioMgr->fWokenUp, true))
        return;

    int rcSignal = VINF_SUCCESS;
    if (ASMAtomicReadBool(&pAioMgr->fWaitingEventSem))
        rcSignal = RTSemEventSignal(pAioMgr->EventSem);
    AssertRC(rcSignal);
}
/**
 * Posts a blocking event to an async I/O manager and waits until it is done.
 *
 * The event type is published first, then the pending flag is raised and the
 * manager is woken up. The function blocks on EventSemBlock without timeout
 * and resets the pending flag and the event type afterwards.
 *
 * @returns Status code of the wait on the blocking event semaphore.
 * @param   pAioMgr     The async I/O manager to send the event to.
 * @param   enmEvent    The event to process.
 */
static int pdmacFileAioMgrWaitForBlockingEvent(PPDMACEPFILEMGR pAioMgr, PDMACEPFILEAIOMGRBLOCKINGEVENT enmEvent)
{
    volatile uint32_t *pu32Event = (volatile uint32_t *)&pAioMgr->enmBlockingEvent;

    /* Publish the request. */
    ASMAtomicWriteU32(pu32Event, enmEvent);
    Assert(!pAioMgr->fBlockingEventPending);
    ASMAtomicXchgBool(&pAioMgr->fBlockingEventPending, true);

    /* Wakeup the async I/O manager */
    pdmacFileAioMgrWakeup(pAioMgr);

    /* Wait for completion. */
    int rcWait = RTSemEventWait(pAioMgr->EventSemBlock, RT_INDEFINITE_WAIT);
    AssertRC(rcWait);

    /* Retract the request again. */
    ASMAtomicXchgBool(&pAioMgr->fBlockingEventPending, false);
    ASMAtomicWriteU32(pu32Event, PDMACEPFILEAIOMGRBLOCKINGEVENT_INVALID);

    return rcWait;
}
/**
 * Hands an endpoint over to an async I/O manager.
 *
 * The request is serialized with other blocking events through
 * CritSectBlockingEvent. On success the manager is recorded in the endpoint.
 *
 * @returns VBox status code.
 * @param   pAioMgr      The manager which should take the endpoint.
 * @param   pEndpoint    The endpoint to add.
 */
int pdmacFileAioMgrAddEndpoint(PPDMACEPFILEMGR pAioMgr, PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint)
{
    int rc = RTCritSectEnter(&pAioMgr->CritSectBlockingEvent);
    AssertRCReturn(rc, rc);

    /* Publish the argument of the event, then let the manager process it. */
    ASMAtomicWritePtr((void * volatile *)&pAioMgr->BlockingEventData.AddEndpoint.pEndpoint, pEndpoint);
    rc = pdmacFileAioMgrWaitForBlockingEvent(pAioMgr, PDMACEPFILEAIOMGRBLOCKINGEVENT_ADD_ENDPOINT);

    RTCritSectLeave(&pAioMgr->CritSectBlockingEvent);

    if (RT_FAILURE(rc))
        return rc;

    ASMAtomicWritePtr((void * volatile *)&pEndpoint->pAioMgr, pAioMgr);
    return rc;
}
/**
 * Asks an async I/O manager to remove an endpoint and waits for the result.
 *
 * The request is serialized with other blocking events through
 * CritSectBlockingEvent.
 *
 * @returns VBox status code.
 * @param   pAioMgr      The manager currently handling the endpoint.
 * @param   pEndpoint    The endpoint to remove.
 */
static int pdmacFileAioMgrRemoveEndpoint(PPDMACEPFILEMGR pAioMgr, PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint)
{
    int rc = RTCritSectEnter(&pAioMgr->CritSectBlockingEvent);
    AssertRCReturn(rc, rc);

    /* Publish the argument of the event, then let the manager process it. */
    ASMAtomicWritePtr((void * volatile *)&pAioMgr->BlockingEventData.RemoveEndpoint.pEndpoint, pEndpoint);
    rc = pdmacFileAioMgrWaitForBlockingEvent(pAioMgr, PDMACEPFILEAIOMGRBLOCKINGEVENT_REMOVE_ENDPOINT);

    RTCritSectLeave(&pAioMgr->CritSectBlockingEvent);
    return rc;
}
/**
 * Asks an async I/O manager to close an endpoint and waits for the result.
 *
 * The request is serialized with other blocking events through
 * CritSectBlockingEvent.
 *
 * @returns VBox status code.
 * @param   pAioMgr      The manager currently handling the endpoint.
 * @param   pEndpoint    The endpoint to close.
 */
static int pdmacFileAioMgrCloseEndpoint(PPDMACEPFILEMGR pAioMgr, PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint)
{
    int rc = RTCritSectEnter(&pAioMgr->CritSectBlockingEvent);
    AssertRCReturn(rc, rc);

    /* Publish the argument of the event, then let the manager process it. */
    ASMAtomicWritePtr((void * volatile *)&pAioMgr->BlockingEventData.CloseEndpoint.pEndpoint, pEndpoint);
    rc = pdmacFileAioMgrWaitForBlockingEvent(pAioMgr, PDMACEPFILEAIOMGRBLOCKINGEVENT_CLOSE_ENDPOINT);

    RTCritSectLeave(&pAioMgr->CritSectBlockingEvent);
    return rc;
}
/**
 * Sends the shutdown event to an async I/O manager and waits until the
 * manager has acknowledged it.
 *
 * The request is serialized with other blocking events through
 * CritSectBlockingEvent.
 *
 * @returns VBox status code.
 * @param   pAioMgr    The manager to shut down.
 */
static int pdmacFileAioMgrShutdown(PPDMACEPFILEMGR pAioMgr)
{
    int rc = RTCritSectEnter(&pAioMgr->CritSectBlockingEvent);
    AssertRCReturn(rc, rc);

    /* The shutdown event carries no argument. */
    rc = pdmacFileAioMgrWaitForBlockingEvent(pAioMgr, PDMACEPFILEAIOMGRBLOCKINGEVENT_SHUTDOWN);

    RTCritSectLeave(&pAioMgr->CritSectBlockingEvent);
    return rc;
}
/**
 * Queues a task on an endpoint and wakes up the responsible I/O manager.
 *
 * The task is pushed onto the head of the endpoint's list of new tasks with
 * a compare-and-exchange loop.
 *
 * @returns VINF_SUCCESS.
 * @param   pEndpoint    The endpoint the task is for.
 * @param   pTask        The task to queue.
 */
int pdmacFileEpAddTask(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMACTASKFILE pTask)
{
    /* Retry until the head did not change between reading it and swapping it. */
    for (;;)
    {
        PPDMACTASKFILE pHead = pEndpoint->pTasksNewHead;

        pTask->pNext = pHead;
        if (ASMAtomicCmpXchgPtr((void * volatile *)&pEndpoint->pTasksNewHead, (void *)pTask, (void *)pHead))
            break;
    }

    pdmacFileAioMgrWakeup((PPDMACEPFILEMGR)ASMAtomicReadPtr((void * volatile *)&pEndpoint->pAioMgr));

    return VINF_SUCCESS;
}
/**
 * Completion callback for a single I/O task.
 *
 * A flush task completes its parent right away. For data transfers the size
 * of the segment is subtracted from the number of bytes the parent still
 * waits for, the first failure status is remembered, and the parent is
 * completed once nothing is left and nobody else completed it already.
 *
 * @returns nothing.
 * @param   pTask     The I/O task which completed.
 * @param   pvUser    The parent task (PPDMASYNCCOMPLETIONTASKFILE).
 * @param   rc        Status code of the completed I/O task.
 */
void pdmacFileEpTaskCompleted(PPDMACTASKFILE pTask, void *pvUser, int rc)
{
    PPDMASYNCCOMPLETIONTASKFILE pTaskFile = (PPDMASYNCCOMPLETIONTASKFILE)pvUser;

    if (pTask->enmTransferType == PDMACTASKFILETRANSFER_FLUSH)
    {
        pdmR3AsyncCompletionCompleteTask(&pTaskFile->Core, rc, true);
        return;
    }

    /* The segment size has to fit into the signed 32-bit transfer counter. */
    Assert((uint32_t)pTask->DataSeg.cbSeg == pTask->DataSeg.cbSeg && (int32_t)pTask->DataSeg.cbSeg >= 0);
    uint32_t cbLeftBefore = ASMAtomicSubS32(&pTaskFile->cbTransferLeft, (int32_t)pTask->DataSeg.cbSeg);

    /* The first error will be returned. */
    if (RT_FAILURE(rc))
        ASMAtomicCmpXchgS32(&pTaskFile->rc, rc, VINF_SUCCESS);

    /* Still segments outstanding, somebody else will finish the parent. */
    if ((cbLeftBefore - pTask->DataSeg.cbSeg) != 0)
        return;

    if (!ASMAtomicXchgBool(&pTaskFile->fCompleted, true))
        pdmR3AsyncCompletionCompleteTask(&pTaskFile->Core, pTaskFile->rc, true);
}
/**
 * Splits a read or write request into one I/O task per segment and queues
 * the tasks on the endpoint.
 *
 * The bookkeeping of the parent task (bytes left, completion flag, status)
 * is reset before the first I/O task is queued. If all I/O tasks completed
 * before this function is done, the parent is completed here.
 *
 * @returns VINF_SUCCESS if the request was completed here,
 *          VINF_AIO_TASK_PENDING otherwise.
 * @param   pTask          The parent task.
 * @param   pEndpoint      The endpoint to transfer from/to.
 * @param   off            Start offset of the transfer.
 * @param   paSegments     Array of data segments.
 * @param   cSegments      Number of entries in paSegments.
 * @param   cbTransfer     Total number of bytes to transfer.
 * @param   enmTransfer    Transfer direction (read or write).
 */
int pdmacFileEpTaskInitiate(PPDMASYNCCOMPLETIONTASK pTask,
                            PPDMASYNCCOMPLETIONENDPOINT pEndpoint, RTFOFF off,
                            PCRTSGSEG paSegments, size_t cSegments,
                            size_t cbTransfer, PDMACTASKFILETRANSFER enmTransfer)
{
    PPDMASYNCCOMPLETIONENDPOINTFILE pEpFile   = (PPDMASYNCCOMPLETIONENDPOINTFILE)pEndpoint;
    PPDMASYNCCOMPLETIONTASKFILE     pTaskFile = (PPDMASYNCCOMPLETIONTASKFILE)pTask;

    Assert(   (enmTransfer == PDMACTASKFILETRANSFER_READ)
           || (enmTransfer == PDMACTASKFILETRANSFER_WRITE));

    /* Reset the parent bookkeeping before any I/O task can complete. */
    Assert((uint32_t)cbTransfer == cbTransfer && (int32_t)cbTransfer >= 0);
    ASMAtomicWriteS32(&pTaskFile->cbTransferLeft, (int32_t)cbTransfer);
    ASMAtomicWriteBool(&pTaskFile->fCompleted, false);
    ASMAtomicWriteS32(&pTaskFile->rc, VINF_SUCCESS);

    for (unsigned iSeg = 0; iSeg < cSegments; iSeg++)
    {
        PCRTSGSEG pSeg = &paSegments[iSeg];

        /* NOTE(review): an allocation failure is only asserted here, not handled. */
        PPDMACTASKFILE pIoTask = pdmacFileTaskAlloc(pEpFile);
        AssertPtr(pIoTask);

        pIoTask->pEndpoint       = pEpFile;
        pIoTask->enmTransferType = enmTransfer;
        pIoTask->Off             = off;
        pIoTask->DataSeg.cbSeg   = pSeg->cbSeg;
        pIoTask->DataSeg.pvSeg   = pSeg->pvSeg;
        pIoTask->pvUser          = pTaskFile;
        pIoTask->pfnCompleted    = pdmacFileEpTaskCompleted;

        /* Send it off to the I/O manager. */
        pdmacFileEpAddTask(pEpFile, pIoTask);

        off        += pSeg->cbSeg;
        cbTransfer -= pSeg->cbSeg;
    }

    AssertMsg(!cbTransfer, ("Incomplete transfer %u bytes left\n", cbTransfer));

    /* Everything may have completed already while we were still queueing. */
    bool fCompletedHere =    ASMAtomicReadS32(&pTaskFile->cbTransferLeft) == 0
                          && !ASMAtomicXchgBool(&pTaskFile->fCompleted, true);
    if (!fCompletedHere)
        return VINF_AIO_TASK_PENDING;

    pdmR3AsyncCompletionCompleteTask(pTask, pTaskFile->rc, false);
    return VINF_SUCCESS;
}
/**
 * Creates a new async I/O manager.
 *
 * Allocates the manager, creates its two event semaphores and the critical
 * section serializing blocking events, runs the extra initialization for
 * non-simple managers, starts the manager thread and finally links the
 * manager into the list of the endpoint class. Everything set up so far is
 * torn down again in reverse order if a step fails.
 *
 * @returns VBox status code.
 * @param   pEpClass      Pointer to the endpoint class data.
 * @param   ppAioMgr      Where to store the pointer to the new async I/O manager on success.
 * @param   enmMgrType    Wanted manager type - can be overwritten by the global override.
 */
int pdmacFileAioMgrCreate(PPDMASYNCCOMPLETIONEPCLASSFILE pEpClass, PPPDMACEPFILEMGR ppAioMgr,
                          PDMACEPFILEMGRTYPE enmMgrType)
{
    int rc = VINF_SUCCESS;
    PPDMACEPFILEMGR pAioMgrNew;

    LogFlowFunc((": Entered\n"));

    rc = MMR3HeapAllocZEx(pEpClass->Core.pVM, MM_TAG_PDM_ASYNC_COMPLETION, sizeof(PDMACEPFILEMGR), (void **)&pAioMgrNew);
    if (RT_SUCCESS(rc))
    {
        /*
         * The requested type is only honoured if it compares lower than the class wide override.
         * NOTE(review): this relies on the ordering of the PDMACEPFILEMGRTYPE values - confirm.
         */
        if (enmMgrType < pEpClass->enmMgrTypeOverride)
            pAioMgrNew->enmMgrType = enmMgrType;
        else
            pAioMgrNew->enmMgrType = pEpClass->enmMgrTypeOverride;

        rc = RTSemEventCreate(&pAioMgrNew->EventSem);
        if (RT_SUCCESS(rc))
        {
            rc = RTSemEventCreate(&pAioMgrNew->EventSemBlock);
            if (RT_SUCCESS(rc))
            {
                rc = RTCritSectInit(&pAioMgrNew->CritSectBlockingEvent);
                if (RT_SUCCESS(rc))
                {
                    /* Init the rest of the manager. */
                    if (pAioMgrNew->enmMgrType != PDMACEPFILEMGRTYPE_SIMPLE)
                        rc = pdmacFileAioMgrNormalInit(pAioMgrNew);

                    if (RT_SUCCESS(rc))
                    {
                        pAioMgrNew->enmState = PDMACEPFILEMGRSTATE_RUNNING;

                        rc = RTThreadCreateF(&pAioMgrNew->Thread,
                                             pAioMgrNew->enmMgrType == PDMACEPFILEMGRTYPE_SIMPLE
                                             ? pdmacFileAioMgrFailsafe
                                             : pdmacFileAioMgrNormal,
                                             pAioMgrNew,
                                             0,
                                             RTTHREADTYPE_IO,
                                             0,
                                             "AioMgr%d-%s", pEpClass->cAioMgrs,
                                             pAioMgrNew->enmMgrType == PDMACEPFILEMGRTYPE_SIMPLE
                                             ? "F"
                                             : "N");
                        if (RT_SUCCESS(rc))
                        {
                            /* Link it into the list. */
                            RTCritSectEnter(&pEpClass->CritSect);
                            pAioMgrNew->pNext = pEpClass->pAioMgrHead;
                            if (pEpClass->pAioMgrHead)
                                pEpClass->pAioMgrHead->pPrev = pAioMgrNew;
                            pEpClass->pAioMgrHead = pAioMgrNew;
                            pEpClass->cAioMgrs++;
                            RTCritSectLeave(&pEpClass->CritSect);

                            *ppAioMgr = pAioMgrNew;

                            Log(("PDMAC: Successfully created new file AIO Mgr {%s}\n", RTThreadGetName(pAioMgrNew->Thread)));
                            return VINF_SUCCESS;
                        }

                        /* Only undo the extended initialization if it was done in the first place. */
                        if (pAioMgrNew->enmMgrType != PDMACEPFILEMGRTYPE_SIMPLE)
                            pdmacFileAioMgrNormalDestroy(pAioMgrNew);
                    }
                    RTCritSectDelete(&pAioMgrNew->CritSectBlockingEvent);
                }
                /* Created last, destroyed first. */
                RTSemEventDestroy(pAioMgrNew->EventSemBlock);
            }
            RTSemEventDestroy(pAioMgrNew->EventSem);
        }
        MMR3HeapFree(pAioMgrNew);
    }

    LogFlowFunc((": Leave rc=%Rrc\n", rc));

    return rc;
}
/**
 * Destroys an async I/O manager.
 *
 * The manager is shut down first, then unlinked from the list of the
 * endpoint class and finally all its resources are released.
 *
 * @returns nothing.
 * @param   pEpClassFile    The endpoint class the manager is linked into.
 * @param   pAioMgr         The async I/O manager to destroy.
 */
static void pdmacFileAioMgrDestroy(PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile, PPDMACEPFILEMGR pAioMgr)
{
    int rc = pdmacFileAioMgrShutdown(pAioMgr);
    AssertRC(rc);

    /* Unlink from the list. */
    rc = RTCritSectEnter(&pEpClassFile->CritSect);
    AssertRC(rc);

    PPDMACEPFILEMGR pPrev = pAioMgr->pPrev;
    PPDMACEPFILEMGR pNext = pAioMgr->pNext;

    if (pPrev)
        pPrev->pNext = pNext;
    else
        pEpClassFile->pAioMgrHead = pNext;

    if (pNext)
        pNext->pPrev = pPrev;

    pEpClassFile->cAioMgrs--;

    rc = RTCritSectLeave(&pEpClassFile->CritSect);
    AssertRC(rc);

    /* Free the resources, including both event semaphores created in pdmacFileAioMgrCreate. */
    RTCritSectDelete(&pAioMgr->CritSectBlockingEvent);
    RTSemEventDestroy(pAioMgr->EventSem);
    RTSemEventDestroy(pAioMgr->EventSemBlock);
    if (pAioMgr->enmMgrType != PDMACEPFILEMGRTYPE_SIMPLE)
        pdmacFileAioMgrNormalDestroy(pAioMgr);

    MMR3HeapFree(pAioMgr);
}
/**
 * Creates the bandwidth manager and reads its limits from the configuration.
 *
 * The configuration keys "VMTransferPerSecMax", "VMTransferPerSecStart" and
 * "VMTransferPerSecStep" are queried; the defaults are UINT32_MAX,
 * UINT32_MAX and _1M. The initially allowed amount equals the start value.
 *
 * @returns VBox status code.
 * @param   pEpClassFile    The endpoint class data, used for the VM handle.
 * @param   pCfgNode        Configuration node to read the limits from.
 * @param   ppBwMgr         Where to store the new bandwidth manager on success.
 */
static int pdmacFileBwMgrInitialize(PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile,
                                    PCFGMNODE pCfgNode, PPPDMACFILEBWMGR ppBwMgr)
{
    PPDMACFILEBWMGR pBwMgr = NULL;

    int rc = MMR3HeapAllocZEx(pEpClassFile->Core.pVM, MM_TAG_PDM_ASYNC_COMPLETION,
                              sizeof(PDMACFILEBWMGR),
                              (void **)&pBwMgr);
    if (RT_FAILURE(rc))
        return rc;

    /* Init I/O flow control. */
    rc = CFGMR3QueryU32Def(pCfgNode, "VMTransferPerSecMax", &pBwMgr->cbVMTransferPerSecMax, UINT32_MAX);
    if (RT_SUCCESS(rc))
        rc = CFGMR3QueryU32Def(pCfgNode, "VMTransferPerSecStart", &pBwMgr->cbVMTransferPerSecStart, UINT32_MAX /*5 * _1M*/);
    if (RT_SUCCESS(rc))
        rc = CFGMR3QueryU32Def(pCfgNode, "VMTransferPerSecStep", &pBwMgr->cbVMTransferPerSecStep, _1M);
    if (RT_FAILURE(rc))
    {
        /* Don't leak the manager when the configuration can't be read. */
        MMR3HeapFree(pBwMgr);
        AssertLogRelRCReturn(rc, rc);
        return rc;
    }

    pBwMgr->cbVMTransferAllowed = pBwMgr->cbVMTransferPerSecStart;
    pBwMgr->tsUpdatedLast       = RTTimeSystemNanoTS();

    if (pBwMgr->cbVMTransferPerSecMax != UINT32_MAX)
        LogRel(("AIOMgr: I/O bandwidth limited to %u bytes/sec\n", pBwMgr->cbVMTransferPerSecMax));
    else
        LogRel(("AIOMgr: I/O bandwidth not limited\n"));

    *ppBwMgr = pBwMgr;
    return rc;
}
/**
 * Frees a bandwidth manager.
 *
 * @returns nothing.
 * @param   pBwMgr    The bandwidth manager to hand back to the MM heap.
 */
static void pdmacFileBwMgrDestroy(PPDMACFILEBWMGR pBwMgr)
{
    MMR3HeapFree(pBwMgr);
}
/**
 * Adds a reference to a bandwidth manager.
 *
 * The counter is updated without atomic operations or locking.
 *
 * @returns nothing.
 * @param   pBwMgr    The bandwidth manager to reference.
 */
static void pdmacFileBwRef(PPDMACFILEBWMGR pBwMgr)
{
    pBwMgr->cRefs += 1;
}
/**
 * Drops a reference from a bandwidth manager.
 *
 * The counter is updated without atomic operations or locking; nothing is
 * freed here when it reaches zero.
 *
 * @returns nothing.
 * @param   pBwMgr    The bandwidth manager to release.
 */
static void pdmacFileBwUnref(PPDMACFILEBWMGR pBwMgr)
{
    Assert(pBwMgr->cRefs > 0);
    pBwMgr->cRefs -= 1;
}
/**
 * Checks whether a transfer of the given size fits into the bandwidth budget.
 *
 * The size is subtracted from the remaining budget up front. If the budget
 * was big enough the transfer is allowed. Otherwise the budget is refilled
 * when at least one second passed since the last refill (raising the per
 * second limit by the configured step up to the maximum), or the subtracted
 * amount is given back when less than a second passed.
 *
 * @returns true if the transfer may be started, false otherwise.
 * @param   pBwMgr        The bandwidth manager.
 * @param   cbTransfer    Size of the transfer in bytes.
 */
bool pdmacFileBwMgrIsTransferAllowed(PPDMACFILEBWMGR pBwMgr, uint32_t cbTransfer)
{
bool fAllowed = false;
LogFlowFunc(("pBwMgr=%p cbTransfer=%u\n", pBwMgr, cbTransfer));
/* Reserve the bytes first; cbOld is compared against the request below as the budget before the subtraction. */
uint32_t cbOld = ASMAtomicSubU32(&pBwMgr->cbVMTransferAllowed, cbTransfer);
if (RT_LIKELY(cbOld >= cbTransfer))
fAllowed = true;
else
{
/* We are out of resources. Check if we can update again. */
uint64_t tsNow = RTTimeSystemNanoTS();
uint64_t tsUpdatedLast = ASMAtomicUoReadU64(&pBwMgr->tsUpdatedLast);
/* Refill at most once per second (timestamps are in nanoseconds). */
if (tsNow - tsUpdatedLast >= (1000*1000*1000))
{
/* Only the caller which manages to update the timestamp does the refill. */
/* NOTE(review): a caller losing this race neither gets the transfer allowed nor gives its bytes back - confirm this is intended. */
if (ASMAtomicCmpXchgU64(&pBwMgr->tsUpdatedLast, tsNow, tsUpdatedLast))
{
if (pBwMgr->cbVMTransferPerSecStart < pBwMgr->cbVMTransferPerSecMax)
{
pBwMgr->cbVMTransferPerSecStart = RT_MIN(pBwMgr->cbVMTransferPerSecMax, pBwMgr->cbVMTransferPerSecStart + pBwMgr->cbVMTransferPerSecStep);
LogFlow(("AIOMgr: Increasing maximum bandwidth to %u bytes/sec\n", pBwMgr->cbVMTransferPerSecStart));
}
/* Update */
/* The new budget already accounts for the current transfer. */
/* NOTE(review): this wraps around if cbTransfer exceeds the per second value - verify callers never request that much. */
ASMAtomicWriteU32(&pBwMgr->cbVMTransferAllowed, pBwMgr->cbVMTransferPerSecStart - cbTransfer);
fAllowed = true;
LogFlow(("AIOMgr: Refreshed bandwidth\n"));
}
}
else
/* Too early for a refill: give the reserved bytes back. */
ASMAtomicAddU32(&pBwMgr->cbVMTransferAllowed, cbTransfer);
}
LogFlowFunc(("fAllowed=%RTbool\n", fAllowed));
return fAllowed;
}
/**
 * Translates a manager type name from the configuration into the enum value.
 *
 * @returns VINF_SUCCESS if the name is "Simple" or "Async",
 *          VERR_CFGM_CONFIG_UNKNOWN_VALUE for everything else (the output is
 *          left untouched in that case).
 * @param   pszVal         The name to translate.
 * @param   penmMgrType    Where to store the manager type on success.
 */
static int pdmacFileMgrTypeFromName(const char *pszVal, PPDMACEPFILEMGRTYPE penmMgrType)
{
    if (RTStrCmp(pszVal, "Simple") == 0)
    {
        *penmMgrType = PDMACEPFILEMGRTYPE_SIMPLE;
        return VINF_SUCCESS;
    }

    if (RTStrCmp(pszVal, "Async") == 0)
    {
        *penmMgrType = PDMACEPFILEMGRTYPE_ASYNC;
        return VINF_SUCCESS;
    }

    return VERR_CFGM_CONFIG_UNKNOWN_VALUE;
}
/**
 * Returns the configuration name of a manager type.
 *
 * @returns "Simple" or "Async", NULL for any other value.
 * @param   enmMgrType    The manager type to name.
 */
static const char *pdmacFileMgrTypeToName(PDMACEPFILEMGRTYPE enmMgrType)
{
    switch (enmMgrType)
    {
        case PDMACEPFILEMGRTYPE_SIMPLE:
            return "Simple";
        case PDMACEPFILEMGRTYPE_ASYNC:
            return "Async";
        default:
            return NULL;
    }
}
/**
 * Translates a file backend name from the configuration into the enum value.
 *
 * @returns VINF_SUCCESS if the name is "Buffered" or "NonBuffered",
 *          VERR_CFGM_CONFIG_UNKNOWN_VALUE for everything else (the output is
 *          left untouched in that case).
 * @param   pszVal             The name to translate.
 * @param   penmBackendType    Where to store the backend type on success.
 */
static int pdmacFileBackendTypeFromName(const char *pszVal, PPDMACFILEEPBACKEND penmBackendType)
{
    if (RTStrCmp(pszVal, "Buffered") == 0)
    {
        *penmBackendType = PDMACFILEEPBACKEND_BUFFERED;
        return VINF_SUCCESS;
    }

    if (RTStrCmp(pszVal, "NonBuffered") == 0)
    {
        *penmBackendType = PDMACFILEEPBACKEND_NON_BUFFERED;
        return VINF_SUCCESS;
    }

    return VERR_CFGM_CONFIG_UNKNOWN_VALUE;
}
/**
 * Returns the configuration name of a file backend type.
 *
 * @returns "Buffered" or "NonBuffered", NULL for any other value.
 * @param   enmBackendType    The backend type to name.
 */
static const char *pdmacFileBackendTypeToName(PDMACFILEEPBACKEND enmBackendType)
{
    switch (enmBackendType)
    {
        case PDMACFILEEPBACKEND_BUFFERED:
            return "Buffered";
        case PDMACFILEEPBACKEND_NON_BUFFERED:
            return "NonBuffered";
        default:
            return NULL;
    }
}
/**
 * Initializes the global data of the file endpoint class.
 *
 * Determines whether the host supports async file I/O, picks the default
 * manager and backend types (from the configuration if a node is given),
 * creates the class critical section, the optional cache and the bandwidth
 * manager.
 *
 * @returns VBox status code.
 * @param   pClassGlobals    The endpoint class data to initialize.
 * @param   pCfgNode         Configuration node of the class, may be NULL.
 */
static int pdmacFileInitialize(PPDMASYNCCOMPLETIONEPCLASS pClassGlobals, PCFGMNODE pCfgNode)
{
    int rc = VINF_SUCCESS;
    RTFILEAIOLIMITS AioLimits; /**< Async I/O limitations. */

    PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pClassGlobals;

    rc = RTFileAioGetLimits(&AioLimits);
#ifdef DEBUG
    /* Debug builds can force the fallback path through the environment. */
    if (RT_SUCCESS(rc) && RTEnvExist("VBOX_ASYNC_IO_FAILBACK"))
        rc = VERR_ENV_VAR_NOT_FOUND;
#endif
    if (RT_FAILURE(rc))
    {
        LogRel(("AIO: Async I/O manager not supported (rc=%Rrc). Falling back to simple manager\n",
                rc));
        pEpClassFile->enmMgrTypeOverride  = PDMACEPFILEMGRTYPE_SIMPLE;
        pEpClassFile->enmEpBackendDefault = PDMACFILEEPBACKEND_BUFFERED;
    }
    else
    {
        pEpClassFile->uBitmaskAlignment   = AioLimits.cbBufferAlignment ? ~((RTR3UINTPTR)AioLimits.cbBufferAlignment - 1) : RTR3UINTPTR_MAX;
        pEpClassFile->cReqsOutstandingMax = AioLimits.cReqsOutstandingMax;

        if (pCfgNode)
        {
            /* Query the default manager type */
            char *pszVal = NULL;
            rc = CFGMR3QueryStringAllocDef(pCfgNode, "IoMgr", &pszVal, "Async");
            AssertLogRelRCReturn(rc, rc);

            rc = pdmacFileMgrTypeFromName(pszVal, &pEpClassFile->enmMgrTypeOverride);
            MMR3HeapFree(pszVal);
            if (RT_FAILURE(rc))
                return rc;

            LogRel(("AIOMgr: Default manager type is \"%s\"\n", pdmacFileMgrTypeToName(pEpClassFile->enmMgrTypeOverride)));

            /* Query default backend type */
#ifndef RT_OS_LINUX
            rc = CFGMR3QueryStringAllocDef(pCfgNode, "FileBackend", &pszVal, "Buffered");
#else /* Linux can't use buffered with async */
            rc = CFGMR3QueryStringAllocDef(pCfgNode, "FileBackend", &pszVal, "NonBuffered");
#endif
            AssertLogRelRCReturn(rc, rc);

            rc = pdmacFileBackendTypeFromName(pszVal, &pEpClassFile->enmEpBackendDefault);
            MMR3HeapFree(pszVal);
            if (RT_FAILURE(rc))
                return rc;

            LogRel(("AIOMgr: Default file backend is \"%s\"\n", pdmacFileBackendTypeToName(pEpClassFile->enmEpBackendDefault)));

#ifdef RT_OS_LINUX
            if (   pEpClassFile->enmMgrTypeOverride == PDMACEPFILEMGRTYPE_ASYNC
                && pEpClassFile->enmEpBackendDefault == PDMACFILEEPBACKEND_BUFFERED)
            {
                LogRel(("AIOMgr: Linux does not support buffered async I/O, changing to non buffered\n"));
                pEpClassFile->enmEpBackendDefault = PDMACFILEEPBACKEND_NON_BUFFERED;
            }
#endif
        }
        else
        {
            /* No configuration supplied, set defaults */
            pEpClassFile->enmMgrTypeOverride  = PDMACEPFILEMGRTYPE_ASYNC;
#ifdef RT_OS_LINUX
            pEpClassFile->enmEpBackendDefault = PDMACFILEEPBACKEND_NON_BUFFERED;
#else
            pEpClassFile->enmEpBackendDefault = PDMACFILEEPBACKEND_BUFFERED;
#endif
        }
    }

    /* Init critical section. */
    rc = RTCritSectInit(&pEpClassFile->CritSect);
    if (RT_SUCCESS(rc))
    {
        /* Check if the cache was disabled by the user. */
        rc = CFGMR3QueryBoolDef(pCfgNode, "CacheEnabled", &pEpClassFile->fCacheEnabled, true);
        if (RT_FAILURE(rc))
        {
            /* Don't leave the critical section behind when bailing out. */
            RTCritSectDelete(&pEpClassFile->CritSect);
            AssertLogRelRCReturn(rc, rc);
            return rc;
        }

        if (pEpClassFile->fCacheEnabled)
        {
            /* Init cache structure */
            rc = pdmacFileCacheInit(pEpClassFile, pCfgNode);
            if (RT_FAILURE(rc))
            {
                /* A broken cache is not fatal, continue without it. */
                pEpClassFile->fCacheEnabled = false;
                LogRel(("AIOMgr: Failed to initialise the cache (rc=%Rrc), disabled caching\n", rc));
            }
        }
        else
            LogRel(("AIOMgr: Cache was globally disabled\n"));

        rc = pdmacFileBwMgrInitialize(pEpClassFile, pCfgNode, &pEpClassFile->pBwMgr);
        if (RT_FAILURE(rc))
        {
            /* Undo what was set up above, in reverse order. */
            if (pEpClassFile->fCacheEnabled)
            {
                pdmacFileCacheDestroy(pEpClassFile);
                pEpClassFile->fCacheEnabled = false;
            }
            RTCritSectDelete(&pEpClassFile->CritSect);
        }
    }

    return rc;
}
/**
 * Tears down the global data of the file endpoint class.
 *
 * Destroys all remaining async I/O managers, the cache (if enabled), the
 * class critical section and the bandwidth manager.
 *
 * @returns nothing.
 * @param   pClassGlobals    The endpoint class data to terminate.
 */
static void pdmacFileTerminate(PPDMASYNCCOMPLETIONEPCLASS pClassGlobals)
{
    PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pClassGlobals;

    /* All endpoints should be closed at this point. */
    AssertMsg(!pEpClassFile->Core.pEndpointsHead, ("There are still endpoints left\n"));

    /* Destroy all left async I/O managers; each call unlinks the current list head. */
    PPDMACEPFILEMGR pMgr;
    while ((pMgr = pEpClassFile->pAioMgrHead) != NULL)
        pdmacFileAioMgrDestroy(pEpClassFile, pMgr);

    /* Destroy the cache. */
    if (pEpClassFile->fCacheEnabled)
        pdmacFileCacheDestroy(pEpClassFile);

    RTCritSectDelete(&pEpClassFile->CritSect);
    pdmacFileBwMgrDestroy(pEpClassFile->pBwMgr);
}
/**
 * Opens the file behind an endpoint and attaches the endpoint to an async
 * I/O manager.
 *
 * The open flags are derived from the endpoint flags, the class wide manager
 * type and the class wide default backend. Non-buffered access is only used
 * for files whose size is a multiple of 512 bytes and is dropped again if the
 * host rejects the open flags.
 *
 * @returns VBox status code.
 * @param   pEndpoint    The endpoint to initialize.
 * @param   pszUri       Path of the file to open.
 * @param   fFlags       Combination of PDMACEP_FILE_FLAGS_READ_ONLY and
 *                       PDMACEP_FILE_FLAGS_CACHING.
 */
static int pdmacFileEpInitialize(PPDMASYNCCOMPLETIONENDPOINT pEndpoint,
                                 const char *pszUri, uint32_t fFlags)
{
    int rc = VINF_SUCCESS;
    PPDMASYNCCOMPLETIONENDPOINTFILE pEpFile = (PPDMASYNCCOMPLETIONENDPOINTFILE)pEndpoint;
    PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pEndpoint->pEpClass;
    PDMACEPFILEMGRTYPE enmMgrType = pEpClassFile->enmMgrTypeOverride;
    PDMACFILEEPBACKEND enmEpBackend = pEpClassFile->enmEpBackendDefault;

    AssertMsgReturn((fFlags & ~(PDMACEP_FILE_FLAGS_READ_ONLY | PDMACEP_FILE_FLAGS_CACHING)) == 0,
                    ("PDMAsyncCompletion: Invalid flag specified\n"), VERR_INVALID_PARAMETER);

    /* NOTE(review): pEpFile->fReadonly is checked by the write and flush paths but is not set here - confirm where it gets initialized. */
    unsigned fFileFlags = fFlags & PDMACEP_FILE_FLAGS_READ_ONLY
                        ? RTFILE_O_READ | RTFILE_O_OPEN | RTFILE_O_DENY_NONE
                        : RTFILE_O_READWRITE | RTFILE_O_OPEN | RTFILE_O_DENY_WRITE;

    if (enmMgrType == PDMACEPFILEMGRTYPE_ASYNC)
        fFileFlags |= RTFILE_O_ASYNC_IO;

    if (enmEpBackend == PDMACFILEEPBACKEND_NON_BUFFERED)
    {
        /*
         * We only disable the host cache if the size of the file is a multiple of 512.
         * Certain hosts like Windows, Linux and Solaris require that transfer sizes
         * are aligned to the volume sector size.
         * If the size does not fit (or can't be determined) we downgrade to the
         * buffered backend; on Linux this also means giving up async I/O.
         * A failure to open the file for this probe is ignored here, the real
         * open below reports it.
         */
        RTFILE File = NIL_RTFILE;

        rc = RTFileOpen(&File, pszUri, RTFILE_O_READ | RTFILE_O_OPEN | RTFILE_O_DENY_NONE);
        if (RT_SUCCESS(rc))
        {
            uint64_t cbSize;

            rc = RTFileGetSize(File, &cbSize);
            if (RT_SUCCESS(rc) && ((cbSize % 512) == 0))
                fFileFlags |= RTFILE_O_NO_CACHE;
            else
            {
                /* Downgrade to the buffered backend */
                enmEpBackend = PDMACFILEEPBACKEND_BUFFERED;

#ifdef RT_OS_LINUX
                fFileFlags &= ~RTFILE_O_ASYNC_IO;
                enmMgrType   = PDMACEPFILEMGRTYPE_SIMPLE;
#endif
            }
            RTFileClose(File);
        }
    }

    /* Open with final flags. */
    rc = RTFileOpen(&pEpFile->File, pszUri, fFileFlags);
    if ((rc == VERR_INVALID_FUNCTION) || (rc == VERR_INVALID_PARAMETER))
    {
        LogRel(("pdmacFileEpInitialize: RTFileOpen %s / %08x failed with %Rrc\n",
                pszUri, fFileFlags, rc));
        /*
         * Solaris doesn't support directio on ZFS so far. :-\
         * Trying to enable it returns VERR_INVALID_FUNCTION
         * (ENOTTY). Remove it and hope for the best.
         * ZFS supports write throttling in case applications
         * write more data than can be synced to the disk
         * without blocking the whole application.
         *
         * On Linux we have the same problem with cifs.
         * Have to disable async I/O here too because it requires O_DIRECT.
         */
        fFileFlags &= ~RTFILE_O_NO_CACHE;
        enmEpBackend = PDMACFILEEPBACKEND_BUFFERED;

#ifdef RT_OS_LINUX
        fFileFlags &= ~RTFILE_O_ASYNC_IO;
        enmMgrType   = PDMACEPFILEMGRTYPE_SIMPLE;
#endif

        /* Open again. */
        rc = RTFileOpen(&pEpFile->File, pszUri, fFileFlags);

        if (RT_FAILURE(rc))
        {
            LogRel(("pdmacFileEpInitialize: RTFileOpen %s / %08x failed AGAIN(!) with %Rrc\n",
                    pszUri, fFileFlags, rc));
        }
    }

    if (RT_SUCCESS(rc))
    {
        pEpFile->fFlags = fFileFlags;

        rc = RTFileGetSize(pEpFile->File, (uint64_t *)&pEpFile->cbFile);
        if (RT_SUCCESS(rc) && (pEpFile->cbFile == 0))
        {
            /* Could be a block device */
            rc = RTFileSeek(pEpFile->File, 0, RTFILE_SEEK_END, (uint64_t *)&pEpFile->cbFile);
        }

        if (RT_SUCCESS(rc))
        {
            /* Initialize the segment cache; the list always keeps one placeholder element. */
            rc = MMR3HeapAllocZEx(pEpClassFile->Core.pVM, MM_TAG_PDM_ASYNC_COMPLETION,
                                  sizeof(PDMACTASKFILE),
                                  (void **)&pEpFile->pTasksFreeHead);
            if (RT_SUCCESS(rc))
            {
                PPDMACEPFILEMGR pAioMgr = NULL;

                pEpFile->cbEndpoint     = pEpFile->cbFile;
                pEpFile->pTasksFreeTail = pEpFile->pTasksFreeHead;
                pEpFile->cTasksCached   = 0;
                pEpFile->pBwMgr         = pEpClassFile->pBwMgr;
                pEpFile->enmBackendType = enmEpBackend;
                pdmacFileBwRef(pEpFile->pBwMgr);

                if (enmMgrType == PDMACEPFILEMGRTYPE_SIMPLE)
                {
                    /* Simple mode. Every file has its own async I/O manager. */
                    rc = pdmacFileAioMgrCreate(pEpClassFile, &pAioMgr, PDMACEPFILEMGRTYPE_SIMPLE);
                    AssertRC(rc);
                }
                else
                {
                    if (   (fFlags & PDMACEP_FILE_FLAGS_CACHING)
                        && (pEpClassFile->fCacheEnabled))
                    {
                        pEpFile->fCaching = true;
                        rc = pdmacFileEpCacheInit(pEpFile, pEpClassFile);
                        if (RT_FAILURE(rc))
                        {
                            LogRel(("AIOMgr: Endpoint for \"%s\" was opened with caching but initializing cache failed. Disabled caching\n", pszUri));
                            pEpFile->fCaching = false;
                            /* Running without the cache is fine, don't carry the failure along. */
                            rc = VINF_SUCCESS;
                        }
                    }

                    pAioMgr = pEpClassFile->pAioMgrHead;

                    /* Check for an existing manager of the same type */
                    while (pAioMgr)
                    {
                        if (pAioMgr->enmMgrType == enmMgrType)
                            break;

                        pAioMgr = pAioMgr->pNext;
                    }

                    if (!pAioMgr)
                    {
                        rc = pdmacFileAioMgrCreate(pEpClassFile, &pAioMgr, enmMgrType);
                        AssertRC(rc);
                    }
                }

                /* Without a manager there is nothing to attach the endpoint to. */
                if (RT_SUCCESS(rc))
                {
                    pEpFile->AioMgr.pTreeRangesLocked = (PAVLRFOFFTREE)RTMemAllocZ(sizeof(AVLRFOFFTREE));
                    if (!pEpFile->AioMgr.pTreeRangesLocked)
                        rc = VERR_NO_MEMORY;
                    else
                    {
                        pEpFile->enmState = PDMASYNCCOMPLETIONENDPOINTFILESTATE_ACTIVE;

                        /* Assign the endpoint to the thread. */
                        rc = pdmacFileAioMgrAddEndpoint(pAioMgr, pEpFile);
                        if (RT_FAILURE(rc))
                        {
                            RTMemFree(pEpFile->AioMgr.pTreeRangesLocked);
                            pEpFile->AioMgr.pTreeRangesLocked = NULL;
                        }
                    }
                }

                if (RT_FAILURE(rc))
                {
                    /*
                     * Undo everything set up in this scope.
                     * NOTE(review): a manager created above is left running - decide whether it should be destroyed here.
                     */
                    if (pEpFile->fCaching)
                    {
                        pdmacFileEpCacheDestroy(pEpFile);
                        pEpFile->fCaching = false;
                    }
                    MMR3HeapFree(pEpFile->pTasksFreeHead);
                    pEpFile->pTasksFreeHead = NULL;
                    pEpFile->pTasksFreeTail = NULL;
                    pdmacFileBwUnref(pEpFile->pBwMgr);
                }
            }
        }

        if (RT_FAILURE(rc))
            RTFileClose(pEpFile->File);
    }

#ifdef VBOX_WITH_STATISTICS
    if (RT_SUCCESS(rc))
    {
        STAMR3RegisterF(pEpClassFile->Core.pVM, &pEpFile->StatRead,
                        STAMTYPE_PROFILE_ADV, STAMVISIBILITY_ALWAYS,
                        STAMUNIT_TICKS_PER_CALL, "Time taken to read from the endpoint",
                        "/PDM/AsyncCompletion/File/%s/Read", RTPathFilename(pEpFile->Core.pszUri));

        STAMR3RegisterF(pEpClassFile->Core.pVM, &pEpFile->StatWrite,
                        STAMTYPE_PROFILE_ADV, STAMVISIBILITY_ALWAYS,
                        STAMUNIT_TICKS_PER_CALL, "Time taken to write to the endpoint",
                        "/PDM/AsyncCompletion/File/%s/Write", RTPathFilename(pEpFile->Core.pszUri));
    }
#endif

    if (RT_SUCCESS(rc))
        LogRel(("AIOMgr: Endpoint for file '%s' (flags %08x) created successfully\n", pszUri, pEpFile->fFlags));

    return rc;
}
/**
 * Node callback used when destroying the locked ranges tree of an endpoint.
 *
 * The tree is expected to be empty when it is destroyed, so being called at
 * all triggers an assertion.
 *
 * @returns VINF_SUCCESS.
 * @param   pNode     The tree node (not used).
 * @param   pvUser    User argument (not used).
 */
static int pdmacFileEpRangesLockedDestroy(PAVLRFOFFNODECORE pNode, void *pvUser)
{
    AssertMsgFailed(("The locked ranges tree should be empty at that point\n"));
    return VINF_SUCCESS;
}
/**
 * Closes an endpoint and releases everything pdmacFileEpInitialize set up.
 *
 * @returns VINF_SUCCESS.
 * @param   pEndpoint    The endpoint to close.
 */
static int pdmacFileEpClose(PPDMASYNCCOMPLETIONENDPOINT pEndpoint)
{
    PPDMASYNCCOMPLETIONENDPOINTFILE pEpFile = (PPDMASYNCCOMPLETIONENDPOINTFILE)pEndpoint;
    PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pEndpoint->pEpClass;

    /* Make sure that all tasks finished for this endpoint. */
    int rc = pdmacFileAioMgrCloseEndpoint(pEpFile->pAioMgr, pEpFile);
    AssertRC(rc);

    /* endpoint and real file size should better be equal now. */
    AssertMsg(pEpFile->cbFile == pEpFile->cbEndpoint,
              ("Endpoint and real file size should match now!\n"));

    /*
     * If the async I/O manager is in failsafe mode this is the only endpoint
     * he processes and thus can be destroyed now.
     */
    if (pEpFile->pAioMgr->enmMgrType == PDMACEPFILEMGRTYPE_SIMPLE)
        pdmacFileAioMgrDestroy(pEpClassFile, pEpFile->pAioMgr);

    /* Free cached tasks. */
    PPDMACTASKFILE pTask = pEpFile->pTasksFreeHead;

    while (pTask)
    {
        PPDMACTASKFILE pTaskFree = pTask;
        pTask = pTask->pNext;
        MMR3HeapFree(pTaskFree);
    }

    /* Free the cached data. */
    if (pEpFile->fCaching)
        pdmacFileEpCacheDestroy(pEpFile);

    /* Remove from the bandwidth manager */
    pdmacFileBwUnref(pEpFile->pBwMgr);

    /*
     * Destroy the locked ranges tree now. The destroy call is only given a
     * node callback, so the root structure allocated with RTMemAllocZ when
     * the endpoint was initialized has to be released separately.
     */
    RTAvlrFileOffsetDestroy(pEpFile->AioMgr.pTreeRangesLocked, pdmacFileEpRangesLockedDestroy, NULL);
    RTMemFree(pEpFile->AioMgr.pTreeRangesLocked);
    pEpFile->AioMgr.pTreeRangesLocked = NULL;

    RTFileClose(pEpFile->File);

#ifdef VBOX_WITH_STATISTICS
    STAMR3Deregister(pEpClassFile->Core.pVM, &pEpFile->StatRead);
    STAMR3Deregister(pEpClassFile->Core.pVM, &pEpFile->StatWrite);
#endif

    return VINF_SUCCESS;
}
/**
 * Starts a read request on a file endpoint.
 *
 * The request goes through the endpoint cache when caching is enabled for
 * the endpoint, otherwise it is turned into I/O tasks directly.
 *
 * @returns Status code of the cache or the task initiation.
 * @param   pTask         The task describing the request.
 * @param   pEndpoint     The endpoint to read from.
 * @param   off           Start offset of the read.
 * @param   paSegments    Array of destination segments.
 * @param   cSegments     Number of entries in paSegments.
 * @param   cbRead        Number of bytes to read.
 */
static int pdmacFileEpRead(PPDMASYNCCOMPLETIONTASK pTask,
                           PPDMASYNCCOMPLETIONENDPOINT pEndpoint, RTFOFF off,
                           PCRTSGSEG paSegments, size_t cSegments,
                           size_t cbRead)
{
    int rc = VINF_SUCCESS;
    PPDMASYNCCOMPLETIONENDPOINTFILE pEpFile = (PPDMASYNCCOMPLETIONENDPOINTFILE)pEndpoint;

    STAM_PROFILE_ADV_START(&pEpFile->StatRead, Read);

    if (!pEpFile->fCaching)
        rc = pdmacFileEpTaskInitiate(pTask, pEndpoint, off, paSegments, cSegments, cbRead,
                                     PDMACTASKFILETRANSFER_READ);
    else
        rc = pdmacFileEpCacheRead(pEpFile, (PPDMASYNCCOMPLETIONTASKFILE)pTask,
                                  off, paSegments, cSegments, cbRead);

    STAM_PROFILE_ADV_STOP(&pEpFile->StatRead, Read);

    return rc;
}
/**
 * Starts a write request on a file endpoint.
 *
 * The request goes through the endpoint cache when caching is enabled for
 * the endpoint, otherwise it is turned into I/O tasks directly. The recorded
 * endpoint size grows if the write ends behind it.
 *
 * @returns VERR_NOT_SUPPORTED for read-only endpoints, otherwise the status
 *          code of the cache or the task initiation.
 * @param   pTask         The task describing the request.
 * @param   pEndpoint     The endpoint to write to.
 * @param   off           Start offset of the write.
 * @param   paSegments    Array of source segments.
 * @param   cSegments     Number of entries in paSegments.
 * @param   cbWrite       Number of bytes to write.
 */
static int pdmacFileEpWrite(PPDMASYNCCOMPLETIONTASK pTask,
                            PPDMASYNCCOMPLETIONENDPOINT pEndpoint, RTFOFF off,
                            PCRTSGSEG paSegments, size_t cSegments,
                            size_t cbWrite)
{
    int rc = VINF_SUCCESS;
    PPDMASYNCCOMPLETIONENDPOINTFILE pEpFile = (PPDMASYNCCOMPLETIONENDPOINTFILE)pEndpoint;

    if (RT_UNLIKELY(pEpFile->fReadonly))
        return VERR_NOT_SUPPORTED;

    STAM_PROFILE_ADV_START(&pEpFile->StatWrite, Write);

    if (!pEpFile->fCaching)
        rc = pdmacFileEpTaskInitiate(pTask, pEndpoint, off, paSegments, cSegments, cbWrite,
                                     PDMACTASKFILETRANSFER_WRITE);
    else
        rc = pdmacFileEpCacheWrite(pEpFile, (PPDMASYNCCOMPLETIONTASKFILE)pTask,
                                   off, paSegments, cSegments, cbWrite);

    STAM_PROFILE_ADV_STOP(&pEpFile->StatWrite, Write);

    /* Increase endpoint size. */
    if (RT_SUCCESS(rc))
    {
        uint64_t offEnd = (uint64_t)off + cbWrite;
        if (offEnd > pEpFile->cbEndpoint)
            ASMAtomicWriteU64(&pEpFile->cbEndpoint, offEnd);
    }

    return rc;
}
/**
 * Starts a flush request on a file endpoint.
 *
 * With caching enabled the flush is handled by the endpoint cache, otherwise
 * a single flush task is queued for the I/O manager.
 *
 * @returns VERR_NOT_SUPPORTED for read-only endpoints, VERR_NO_MEMORY if no
 *          I/O task could be allocated, the status of the cache flush when
 *          caching is enabled, VINF_AIO_TASK_PENDING otherwise.
 * @param   pTask        The task describing the request.
 * @param   pEndpoint    The endpoint to flush.
 */
static int pdmacFileEpFlush(PPDMASYNCCOMPLETIONTASK pTask,
                            PPDMASYNCCOMPLETIONENDPOINT pEndpoint)
{
    int rc = VINF_SUCCESS;
    PPDMASYNCCOMPLETIONENDPOINTFILE pEpFile = (PPDMASYNCCOMPLETIONENDPOINTFILE)pEndpoint;
    PPDMASYNCCOMPLETIONTASKFILE pTaskFile = (PPDMASYNCCOMPLETIONTASKFILE)pTask;

    if (RT_UNLIKELY(pEpFile->fReadonly))
        return VERR_NOT_SUPPORTED;

    /* A flush transfers no data. */
    pTaskFile->cbTransferLeft = 0;
    pTaskFile->rc             = VINF_SUCCESS;

    if (pEpFile->fCaching)
        rc = pdmacFileEpCacheFlush(pEpFile, pTaskFile);
    else
    {
        PPDMACTASKFILE pIoTask = pdmacFileTaskAlloc(pEpFile);
        /* Nothing was queued yet, so failing the request here is safe. */
        if (RT_UNLIKELY(!pIoTask))
            return VERR_NO_MEMORY;

        pIoTask->pEndpoint       = pEpFile;
        pIoTask->enmTransferType = PDMACTASKFILETRANSFER_FLUSH;
        pIoTask->pvUser          = pTaskFile;
        pIoTask->pfnCompleted    = pdmacFileEpTaskCompleted;
        pdmacFileEpAddTask(pEpFile, pIoTask);
        rc = VINF_AIO_TASK_PENDING;
    }

    return rc;
}
/**
 * Returns the current size recorded for an endpoint.
 *
 * @returns VINF_SUCCESS.
 * @param   pEndpoint    The endpoint to query.
 * @param   pcbSize      Where to store the size in bytes.
 */
static int pdmacFileEpGetSize(PPDMASYNCCOMPLETIONENDPOINT pEndpoint, uint64_t *pcbSize)
{
    PPDMASYNCCOMPLETIONENDPOINTFILE pEpFile = (PPDMASYNCCOMPLETIONENDPOINTFILE)pEndpoint;
    uint64_t cbCurrent = ASMAtomicReadU64(&pEpFile->cbEndpoint);

    *pcbSize = cbCurrent;
    return VINF_SUCCESS;
}
/**
 * Sets the size of an endpoint and of the file behind it.
 *
 * The recorded endpoint size is updated first, regardless of whether
 * resizing the file succeeds.
 *
 * @returns Status code of RTFileSetSize.
 * @param   pEndpoint    The endpoint to resize.
 * @param   cbSize       The new size in bytes.
 */
static int pdmacFileEpSetSize(PPDMASYNCCOMPLETIONENDPOINT pEndpoint, uint64_t cbSize)
{
    PPDMASYNCCOMPLETIONENDPOINTFILE pEpFile = (PPDMASYNCCOMPLETIONENDPOINTFILE)pEndpoint;

    ASMAtomicWriteU64(&pEpFile->cbEndpoint, cbSize);

    int rcResize = RTFileSetSize(pEpFile->File, cbSize);
    return rcResize;
}
/**
 * Operations table of the "File" endpoint class.
 *
 * Bundles the sizes of the class, endpoint and task structures with the
 * callbacks implemented in this file. The initializer is positional; the
 * comment in front of each entry gives the member it is meant for.
 */
const PDMASYNCCOMPLETIONEPCLASSOPS g_PDMAsyncCompletionEndpointClassFile =
{
/* u32Version */
PDMAC_EPCLASS_OPS_VERSION,
/* pcszName */
"File",
/* enmClassType */
PDMASYNCCOMPLETIONEPCLASSTYPE_FILE,
/* cbEndpointClassGlobal */
sizeof(PDMASYNCCOMPLETIONEPCLASSFILE),
/* cbEndpoint */
sizeof(PDMASYNCCOMPLETIONENDPOINTFILE),
/* cbTask */
sizeof(PDMASYNCCOMPLETIONTASKFILE),
/* pfnInitialize */
pdmacFileInitialize,
/* pfnTerminate */
pdmacFileTerminate,
/* pfnEpInitialize. */
pdmacFileEpInitialize,
/* pfnEpClose */
pdmacFileEpClose,
/* pfnEpRead */
pdmacFileEpRead,
/* pfnEpWrite */
pdmacFileEpWrite,
/* pfnEpFlush */
pdmacFileEpFlush,
/* pfnEpGetSize */
pdmacFileEpGetSize,
/* pfnEpSetSize */
pdmacFileEpSetSize,
/* u32VersionEnd */
PDMAC_EPCLASS_OPS_VERSION
};