PDMAsyncCompletionFileNormal.cpp revision 10d739d22a5d5a13803f7e34de34de010099270c
/* $Id$ */
/** @file
* PDM Async I/O - Async File I/O manager.
*/
/*
* Copyright (C) 2006-2013 Oracle Corporation
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*/
/*******************************************************************************
* Header Files *
*******************************************************************************/
#include "PDMAsyncCompletionFileInternal.h"
/** The update period for the I/O load statistics in ms. */
#define PDMACEPFILEMGR_LOAD_UPDATE_PERIOD 1000
/** Maximum number of requests a manager will handle. */
#define PDMACEPFILEMGR_REQS_STEP 512
/*******************************************************************************
* Internal functions *
*******************************************************************************/
/* I/O manager initializer (fragment).
 * NOTE(review): the function signature and several statements (the AIO
 * context creation that produces rc, the VERR_OUT_OF_RANGE consequent and
 * the pahReqsFree allocation) appear elided in this extraction -- confirm
 * against the full source before relying on this annotation. */
{
if (rc == VERR_OUT_OF_RANGE)
if (RT_SUCCESS(rc))
{
/* Initialize request handle array. */
pAioMgr->iFreeEntry = 0;
if (pAioMgr->pahReqsFree)
{
/* Create the range lock memcache. */
if (RT_SUCCESS(rc))
return VINF_SUCCESS;
}
else
{
/* Allocating the free request-handle array failed. */
rc = VERR_NO_MEMORY;
}
}
return rc;
}
/* Teardown helper (fragment): drains the cached request-handle stack.
 * NOTE(review): the per-iteration handle destruction call appears elided
 * in this extraction -- confirm against the full source. */
{
while (pAioMgr->iFreeEntry > 0)
{
/* Pop one cached handle off the free stack. */
pAioMgr->iFreeEntry--;
}
}
#if 0 /* currently unused */
/**
* Sorts the endpoint list with insertion sort.
*/
{
while (pEpCurr)
{
/* Remember the next element to sort because the list might change. */
/* Unlink the current element from the list. */
if (pPrev)
else
if (pNext)
/* Go back until we reached the place to insert the current endpoint into. */
/* Link the endpoint into the list. */
if (pEpPrev)
else
if (pNext)
if (pEpPrev)
else
}
#ifdef DEBUG
/* Validate sorting algorithm */
unsigned cEndpoints = 0;
AssertMsg(!pEpCurr->AioMgr.pEndpointPrev, ("First element in the list points to previous element\n"));
while (pEpCurr)
{
cEndpoints++;
}
#endif
}
#endif /* currently unused */
/**
* Removes an endpoint from the currently assigned manager.
*
* @returns TRUE if there are still requests pending on the current manager for this endpoint.
* FALSE otherwise.
* @param pEndpointRemove The endpoint to remove.
*/
{
/* NOTE(review): the list-unlink statements for the pPrev/pNext branches
 * appear elided in this extraction. */
pAioMgr->cEndpoints--;
if (pPrev)
else
if (pNext)
/* Make sure that there is no request pending on this manager for the endpoint. */
{
/* Reopen the file so that the new endpoint can re-associate with the file */
int rc = RTFileOpen(&pEndpointRemove->hFile, pEndpointRemove->Core.pszUri, pEndpointRemove->fFlags);
/* NOTE(review): rc from RTFileOpen is not checked in this visible
 * fragment -- verify error handling against the full source. */
return false;
}
return true;
}
#if 0 /* currently unused */
{
/* Balancing doesn't make sense with only one endpoint. */
return false;
/* Doesn't make sense to move endpoints if only one produces the whole load */
unsigned cEndpointsWithLoad = 0;
while (pCurr)
{
}
return (cEndpointsWithLoad > 1);
}
/**
* Creates a new I/O manager and spreads the I/O load of the endpoints
* between the given I/O manager and the new one.
*
* @returns nothing.
* @param pAioMgr The I/O manager with high I/O load.
*/
{
/*
* Check if balancing would improve the situation.
*/
{
PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pAioMgr->pEndpointsHead->Core.pEpClass;
if (RT_SUCCESS(rc))
{
/* We will sort the list by request count per second. */
/* Now move some endpoints to the new manager. */
unsigned cReqsOther = 0;
while (pCurr)
{
if (cReqsHere <= cReqsOther)
{
/*
* The other manager has more requests to handle now.
* We will keep the current endpoint.
*/
}
else
{
/* Move to other endpoint. */
Log(("Moving endpoint %#p{%s} with %u reqs/s to other manager\n", pCurr, pCurr->Core.pszUri, pCurr->AioMgr.cReqsPerSec));
if (fReqsPending)
{
}
else
{
}
}
}
}
else
{
/* Don't process further but leave a log entry about reduced performance. */
}
}
else
Log(("AIOMgr: Load balancing would not improve anything\n"));
}
#endif /* unused */
/**
* Increase the maximum number of active requests for the given I/O manager.
*
* @returns VBox status code.
* @param pAioMgr The I/O manager to grow.
*/
{
&& !pAioMgr->cRequestsActive,
("Invalid state of the I/O manager\n"));
#ifdef RT_OS_WINDOWS
/*
* Reopen the files of all assigned endpoints first so we can assign them to the new
* I/O context.
*/
while (pCurr)
{
}
#endif
/* Create the new bigger context. */
if (rc == VERR_OUT_OF_RANGE)
if (RT_SUCCESS(rc))
{
/* Close the old context. */
/* Create a new I/O task handle array */
if (pahReqNew)
{
/* Copy the cached request handles. */
LogFlowFunc(("I/O manager increased to handle a maximum of %u requests\n",
}
else
rc = VERR_NO_MEMORY;
}
#ifdef RT_OS_WINDOWS
/* Assign the file to the new context. */
while (pCurr)
{
}
#endif
if (RT_FAILURE(rc))
{
}
return rc;
}
/**
* Checks if a given status code is fatal.
* Non fatal errors can be fixed by migrating the endpoint to a
* failsafe manager.
*
* @returns true If the error is fatal and migrating to a failsafe manager doesn't help
* false If the error can be fixed by a migration. (image on NFS disk for example)
* @param rcReq The status code to check.
*/
{
/* These codes indicate the medium itself is broken or exhausted (I/O
 * error, disk full, file size limit), so migrating the endpoint to a
 * failsafe manager cannot help; everything else is treated as fixable. */
return rcReq == VERR_DEV_IO_ERROR
|| rcReq == VERR_FILE_IO_ERROR
|| rcReq == VERR_DISK_IO_ERROR
|| rcReq == VERR_DISK_FULL
|| rcReq == VERR_FILE_TOO_BIG;
}
/**
* Error handler which will create the failsafe managers and destroy the failed I/O manager.
*
* @returns VBox status code
* @param pAioMgr The I/O manager the error occurred on.
* @param rc The error code.
*/
{
/* Report the failure once at release-log level so support can spot the
 * fallback; the remaining format arguments are elided in this extraction. */
LogRel(("AIOMgr: I/O manager %#p encountered a critical error (rc=%Rrc) during operation. Falling back to failsafe mode. Expect reduced performance\n",
LogRel(("AIOMgr: Please contact the product vendor\n"));
PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pAioMgr->pEndpointsHead->Core.pEpClass;
/* Force every manager of this endpoint class to the simple (failsafe)
 * type from now on. */
ASMAtomicWriteU32((volatile uint32_t *)&pEpClassFile->enmMgrTypeOverride, PDMACEPFILEMGRTYPE_SIMPLE);
/* NOTE(review): actual migration of the failed manager's endpoints is
 * not implemented yet -- intentional assertion placeholder. */
AssertMsgFailed(("Implement\n"));
return VINF_SUCCESS;
}
/**
* Put a list of tasks in the pending request list of an endpoint.
*/
DECLINLINE(void) pdmacFileAioMgrEpAddTaskList(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMACTASKFILE pTaskHead)
{
/* Add the rest of the tasks to the pending list */
/* NOTE(review): the condition and the head/tail link statements of this
 * if/else are elided in this extraction; presumably it appends pTaskHead
 * to the endpoint's pending list -- confirm against the full source. */
{
}
else
{
}
/* Update the tail. */
}
/**
* Put one task in the pending request list of an endpoint.
*/
DECLINLINE(void) pdmacFileAioMgrEpAddTask(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMACTASKFILE pTask)
{
/* Add the single task to the pending list.
 * (Original comment looked copy-pasted from the list variant.)
 * NOTE(review): the condition and link statements are elided in this
 * extraction -- confirm against the full source. */
{
}
else
{
}
}
/**
* Allocates a async I/O request.
*
* @returns Handle to the request.
* @param pAioMgr The I/O manager.
*/
/* Allocates an async I/O request handle (fragment): prefers a cached
 * handle from the manager's free stack. */
{
/* Get a request handle. */
if (pAioMgr->iFreeEntry > 0)
{
pAioMgr->iFreeEntry--;
/* NOTE(review): the load of hReq from the free array is elided in this
 * extraction. */
}
else
{
/* NOTE(review): creation of a fresh request handle is elided. */
}
return hReq;
}
/**
* Frees a async I/O request handle.
*
* @returns nothing.
* @param pAioMgr The I/O manager.
* @param hReq The I/O request handle to free.
*/
/* Returns a request handle to the manager's free stack (fragment).
 * NOTE(review): the store of hReq into the free array appears elided in
 * this extraction -- confirm against the full source. */
{
pAioMgr->iFreeEntry++;
}
/**
* Wrapper around RTFileAioCtxSubmit() which also does error handling.
*/
{
LogFlow(("Enqueuing %d requests. I/O manager has a total of %d active requests now\n", cReqs, pAioMgr->cRequestsActive));
if (RT_FAILURE(rc))
{
{
/* Append any not submitted task to the waiting list. */
{
if (rcReq != VERR_FILE_AIO_IN_PROGRESS)
{
{
/* Clear the pending flush */
}
}
}
/* Print an entry in the release log */
{
pEpClass->fOutOfResourcesWarningPrinted = true;
LogRel(("AIOMgr: Host limits number of active IO requests to %u. Expect a performance impact.\n",
}
LogFlow(("Removed requests. I/O manager has a total of %u active requests now\n", pAioMgr->cRequestsActive));
rc = VINF_SUCCESS;
}
else /* Another kind of error happened (full disk, ...) */
{
/* An error happened. Find out which one caused the error and resubmit all other tasks. */
{
if (rcReq == VERR_FILE_AIO_NOT_SUBMITTED)
{
/* We call ourself again to do any error handling which might come up now. */
}
else if (rcReq != VERR_FILE_AIO_IN_PROGRESS)
}
&& !pAioMgr->cRequestsActive
{
/*
* Complete a pending flush if we don't have requests enqueued and the host doesn't support
* the async flush API.
* Happens only if we just noticed that this is not supported
* and the only active request was a flush.
*/
}
}
}
return VINF_SUCCESS;
}
/* Checks whether [offStart, offStart+cbRange) intersects a locked range
 * in the endpoint's AVL tree; if so, the task is queued on the lock's
 * waiting list and true is returned (fragment -- signature and several
 * statements elided in this extraction). */
{
/*
* If there is no unaligned request active and the current one is aligned
* just pass it through.
*/
/* NOTE(review): the guarding condition for this early-out is elided. */
return false;
pRangeLock = (PPDMACFILERANGELOCK)RTAvlrFileOffsetRangeGet(pEndpoint->AioMgr.pTreeRangesLocked, offStart);
if (!pRangeLock)
{
/* No exact hit: look for the nearest range above and test for overlap. */
pRangeLock = (PPDMACFILERANGELOCK)RTAvlrFileOffsetGetBestFit(pEndpoint->AioMgr.pTreeRangesLocked, offStart, true);
/* Check if we intersect with the range. */
if ( !pRangeLock
{
}
}
/* Check whether we have one of the situations explained below */
if (pRangeLock)
{
/* Add to the list. */
/* NOTE(review): the enqueue statements for both branches are elided. */
if (!pRangeLock->pWaitingTasksHead)
{
}
else
{
}
return true;
}
return false;
}
/* Locks a file range for a task by inserting a PDMACFILERANGELOCK into
 * the endpoint's AVL tree (fragment -- signature, the assertion arguments
 * and the lock-initialization statements are elided in this extraction). */
{
LogFlowFunc(("pAioMgr=%#p pEndpoint=%#p offStart=%RTfoff cbRange=%zu pTask=%#p\n",
("Range is already locked offStart=%RTfoff cbRange=%u\n",
/*
* If there is no unaligned request active and the current one is aligned
* just don't use the lock.
*/
{
return VINF_SUCCESS;
}
PPDMACFILERANGELOCK pRangeLock = (PPDMACFILERANGELOCK)RTMemCacheAlloc(pAioMgr->hMemCacheRangeLocks);
if (!pRangeLock)
return VERR_NO_MEMORY;
/* Init the lock. */
/* Let the task point to its lock. */
return VINF_SUCCESS;
}
/* Releases a range lock and returns the head of the tasks that were
 * waiting on it, or NULL when no lock was assigned (fragment). */
{
LogFlowFunc(("pAioMgr=%#p pEndpoint=%#p pRangeLock=%#p\n",
/* pRangeLock can be NULL if there was no lock assigned with the task. */
if (!pRangeLock)
return NULL;
/* NOTE(review): removal of the lock from the AVL tree, freeing it back
 * to the memcache, and collecting pTasksWaitingHead are elided in this
 * extraction -- confirm against the full source. */
return pTasksWaitingHead;
}
{
("Read exceeds file size offStart=%RTfoff cbToTransfer=%d cbFile=%llu\n",
pTask->cbBounceBuffer = 0;
/*
* Before we start to setup the request we have to check whether there is a task
* already active which range intersects with ours. We have to defer execution
* of this task in two cases:
* - The pending task is a write and the current is either read or write
* - The pending task is a read and the current task is a write task.
*
* To check whether a range is currently "locked" we use the AVL tree where every pending task
* is stored by its file offset range. The current task will be added to the active task
* and will be executed when the active one completes. (The method below
* which checks whether a range is already used will add the task)
*
* This is necessary because of the requirement to align all requests to a 512 boundary
* which is enforced by the host OS (Linux and Windows atm). It is possible that
* we have to process unaligned tasks and need to align them using bounce buffers.
* While the data is fetched from the file another request might arrive writing to
* the same range. This will result in data corruption if both are executed concurrently.
*/
int rc = VINF_SUCCESS;
bool fLocked = pdmacFileAioMgrNormalIsRangeLocked(pEndpoint, pTask->Off, pTask->DataSeg.cbSeg, pTask,
true /* fAlignedReq */);
if (!fLocked)
{
/* Get a request handle. */
{
/* Grow the file if needed. */
{
}
}
else
pTask, true /* fAlignedReq */);
if (RT_SUCCESS(rc))
{
}
}
else
return rc;
}
{
/*
* Check if the alignment requirements are met.
* Offset, transfer size and buffer address
* need to be on a 512 boundary.
*/
("Read exceeds file size offStart=%RTfoff cbToTransfer=%d cbFile=%llu\n",
/*
* Before we start to setup the request we have to check whether there is a task
* already active which range intersects with ours. We have to defer execution
* of this task in two cases:
* - The pending task is a write and the current is either read or write
* - The pending task is a read and the current task is a write task.
*
* To check whether a range is currently "locked" we use the AVL tree where every pending task
* is stored by its file offset range. The current task will be added to the active task
* and will be executed when the active one completes. (The method below
* which checks whether a range is already used will add the task)
*
* This is necessary because of the requirement to align all requests to a 512 boundary
* which is enforced by the host OS (Linux and Windows atm). It is possible that
* we have to process unaligned tasks and need to align them using bounce buffers.
* While the data is fetched from the file another request might arrive writing to
* the same range. This will result in data corruption if both are executed concurrently.
*/
int rc = VINF_SUCCESS;
bool fLocked = pdmacFileAioMgrNormalIsRangeLocked(pEndpoint, offStart, cbToTransfer, pTask, fAlignedReq);
if (!fLocked)
{
PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pEndpoint->Core.pEpClass;
/* Get a request handle. */
if ( !fAlignedReq
{
LogFlow(("Using bounce buffer for task %#p cbToTransfer=%zd cbSeg=%zd offStart=%RTfoff off=%RTfoff\n",
/* Create bounce buffer. */
/** @todo: I think we need something like a RTMemAllocAligned method here.
* Current assumption is that the maximum alignment is 4096byte
* (GPT disk on Windows)
* so we can use RTMemPageAlloc here.
*/
{
{
{
/* We have to fill the buffer first before we can update the data. */
}
else
}
}
else
rc = VERR_NO_MEMORY;
}
else
pTask->cbBounceBuffer = 0;
if (RT_SUCCESS(rc))
{
("AIO: Alignment restrictions not met! pvBuf=%p uBitmaskAlignment=%p\n", pvBuf, pEpClassFile->uBitmaskAlignment));
{
/* Grow the file if needed. */
{
}
}
else
rc = pdmacFileAioMgrNormalRangeLock(pAioMgr, pEndpoint, offStart, cbToTransfer, pTask, fAlignedReq);
if (RT_SUCCESS(rc))
{
}
else
{
/* Cleanup */
if (pTask->cbBounceBuffer)
}
}
}
else
return rc;
}
{
unsigned cRequests = 0;
int rc = VINF_SUCCESS;
("Trying to process request lists of a non active endpoint!\n"));
/* Go through the list and queue the requests until we get a flush request */
while ( pTaskHead
&& RT_SUCCESS(rc))
{
{
break;
}
("Endpoints do not match\n"));
switch (pCurr->enmTransferType)
{
{
/* If there is no data transfer request this flush request finished immediately. */
{
/* Issue a flush to the host. */
if (RT_FAILURE(rc))
{
pEndpoint->fAsyncFlushSupported = false;
}
else
{
cRequests++;
}
}
{
}
else
{
}
break;
}
{
{
else
}
else
{
}
if (hReq != NIL_RTFILEAIOREQ)
{
cRequests++;
}
break;
}
default:
} /* switch transfer type */
/* Queue the requests if the array is full. */
{
cRequests = 0;
("Unexpected return code\n"));
}
}
if (cRequests)
{
("Unexpected return code rc=%Rrc\n", rc));
}
if (pTaskHead)
{
/* Add the rest of the tasks to the pending list */
{
#if 0
/*
* The I/O manager has no room left for more requests
* but there are still requests to process.
* Create a new I/O manager and let it handle some endpoints.
*/
#else
/* Grow the I/O manager */
#endif
}
}
/* Insufficient resources are not fatal. */
rc = VINF_SUCCESS;
return rc;
}
/**
* Adds all pending requests for the given endpoint
* until a flush request is encountered or there is no
* request anymore.
*
* @returns VBox status code.
* @param pAioMgr The async I/O manager for the endpoint
* @param pEndpoint The endpoint to get the requests from.
*/
/* Queues all pending requests of an endpoint until a flush request is
 * encountered or no requests remain (fragment -- the assertion head, the
 * list fetch/clear statements and the processing calls are elided in this
 * extraction). */
{
int rc = VINF_SUCCESS;
("Trying to process request lists of a non active endpoint!\n"));
/* Check the pending list first */
{
LogFlow(("Queuing pending requests first\n"));
/*
* Clear the list as the processing routine will insert them into the list
* again if it gets a flush request.
*/
}
{
/* Now the request queue. */
if (pTasksHead)
{
}
}
return rc;
}
{
int rc = VINF_SUCCESS;
bool fNotifyWaiter = false;
LogFlowFunc((": Enter\n"));
switch (pAioMgr->enmBlockingEvent)
{
{
PPDMASYNCCOMPLETIONENDPOINTFILE pEndpointNew = ASMAtomicReadPtrT(&pAioMgr->BlockingEventData.AddEndpoint.pEndpoint, PPDMASYNCCOMPLETIONENDPOINTFILE);
if (pAioMgr->pEndpointsHead)
/* Assign the completion point to this file. */
fNotifyWaiter = true;
pAioMgr->cEndpoints++;
break;
}
{
PPDMASYNCCOMPLETIONENDPOINTFILE pEndpointRemove = ASMAtomicReadPtrT(&pAioMgr->BlockingEventData.RemoveEndpoint.pEndpoint, PPDMASYNCCOMPLETIONENDPOINTFILE);
break;
}
{
PPDMASYNCCOMPLETIONENDPOINTFILE pEndpointClose = ASMAtomicReadPtrT(&pAioMgr->BlockingEventData.CloseEndpoint.pEndpoint, PPDMASYNCCOMPLETIONENDPOINTFILE);
{
/* Make sure all tasks finished. Process the queues a last time first. */
}
fNotifyWaiter = true;
break;
}
{
if (!pAioMgr->cRequestsActive)
fNotifyWaiter = true;
break;
}
{
break;
}
{
fNotifyWaiter = true;
break;
}
default:
}
if (fNotifyWaiter)
{
/* Release the waiting thread. */
LogFlow(("Signalling waiter\n"));
}
LogFlowFunc((": Leave\n"));
return rc;
}
/**
* Checks all endpoints for pending events or new requests.
*
* @returns VBox status code.
* @param pAioMgr The I/O manager handle.
*/
{
/* Check the assigned endpoints for new tasks if there isn't a flush request active at the moment. */
int rc = VINF_SUCCESS;
/* NOTE(review): the loop-variable initialization and advance, plus the
 * per-endpoint queueing call producing rc, are elided in this
 * extraction -- confirm against the full source. */
while (pEndpoint)
{
{
if (RT_FAILURE(rc))
return rc;
}
{
/* Reopen the file so that the new endpoint can re-associate with the file */
{
}
else
{
/* Release the waiting thread. */
LogFlow(("Signalling waiter\n"));
}
}
}
return rc;
}
/**
* Wrapper around pdmacFileAioMgrNormalReqCompleteRc().
*/
/* Completion wrapper (fragment): per the preceding doc comment it wraps
 * pdmacFileAioMgrNormalReqCompleteRc(); the status query and forwarding
 * call are elided in this extraction. */
{
size_t cbTransfered = 0;
}
{
int rc = VINF_SUCCESS;
/*
* It is possible that the request failed on Linux with kernels < 2.6.23
* if the passed buffer was allocated with remap_pfn_range or if the file
* is on an NFS endpoint which does not support async and direct I/O at the same time.
* The endpoint will be migrated to a failsafe manager in case a request fails.
*/
if (RT_FAILURE(rcReq))
{
/* Free bounce buffers and the IPRT request. */
{
pEndpoint->fAsyncFlushSupported = false;
/* The other method will take over now. */
/* Call completion callback */
}
else
{
/* Free the lock and process pending tasks if necessary */
if (pTask->cbBounceBuffer)
/*
* Fatal errors are reported to the guest and non-fatal errors
* will cause a migration to the failsafe manager in the hope
* that the error disappears.
*/
{
/* Queue the request on the pending list. */
/* Create a new failsafe manager if necessary. */
{
LogRel(("%s: Request %#p failed with rc=%Rrc, migrating endpoint %s to failsafe manager.\n",
/* Update the flags to open the file with. Disable async I/O and enable the host cache. */
}
/* If this was the last request for the endpoint migrate it to the new manager. */
{
}
}
else
{
}
}
}
else
{
{
/* Clear pending flush */
/* Call completion callback */
}
else
{
/*
* Restart an incomplete transfer.
* This usually means that the request will return an error now
* but to get the cause of the error (disk full, file too big, I/O error, ...)
* the transfer needs to be continued.
*/
|| ( pTask->cbBounceBuffer
{
LogFlow(("Restarting incomplete transfer %#p (%zu bytes transferred)\n",
pTask, cbTransfered));
if (pTask->cbBounceBuffer)
{
}
else
{
}
{
}
else
{
("Invalid transfer type\n"));
}
("Unexpected return code rc=%Rrc\n", rc));
}
{
/* Write it now. */
/* Grow the file if needed. */
{
}
("Unexpected return code rc=%Rrc\n", rc));
}
else
{
{
}
/* Free the lock and process pending tasks if necessary */
if (pTasksWaiting)
{
}
/* Call completion callback */
/*
* If there is no request left on the endpoint but a flush request is set
* it completed now and we notify the owner.
* Furthermore we look for new requests and continue.
*/
{
/* Call completion callback */
AssertMsg(pTask->pEndpoint == pEndpoint, ("Endpoint of the flush request does not match assigned one\n"));
}
{
/* If the endpoint is about to be migrated do it now. */
}
}
} /* Not a flush request */
} /* request completed successfully */
}
/** Helper macro for checking for error codes. */
if (RT_FAILURE(rc)) \
{\
return rc2;\
}
/**
* The normal I/O manager using the RTFileAio* API
*
* @returns VBox status code.
* @param hThreadSelf Handle of the thread.
* @param pvUser Opaque user data.
*/
{
int rc = VINF_SUCCESS;
{
if (!pAioMgr->cRequestsActive)
{
LogFlow(("Got woken up\n"));
}
/* Check for an external blocking event first. */
if (pAioMgr->fBlockingEventPending)
{
}
{
/* We got woken up because an endpoint issued new requests. Queue them. */
while (pAioMgr->cRequestsActive)
{
uint32_t cReqsCompleted = 0;
else
1,
for (uint32_t i = 0; i < cReqsCompleted; i++)
/* Check for an external blocking event before we go to sleep again. */
if (pAioMgr->fBlockingEventPending)
{
}
/* Update load statistics. */
if (uMillisCurr > uMillisEnd)
{
/* Calculate timespan. */
while (pEndpointCurr)
{
pEndpointCurr->AioMgr.cReqsPerSec = pEndpointCurr->AioMgr.cReqsProcessed / (uMillisCurr + PDMACEPFILEMGR_LOAD_UPDATE_PERIOD);
}
/* Set new update interval */
}
/* Check endpoints for new requests. */
{
}
} /* while requests are active. */
{
}
} /* if still running */
} /* while running */
return rc;
}