PDMAsyncCompletionFileNormal.cpp revision 22ec733a5e041fcdfe02fce2eafc9faf8b0077dd
/* $Id$ */
/** @file
* PDM Async I/O - Transport data asynchronous in R3 using EMT.
* Async File I/O manager.
*/
/*
* Copyright (C) 2006-2008 Sun Microsystems, Inc.
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
* Clara, CA 95054 USA or visit http://www.sun.com if you need
* additional information or have any questions.
*/
#include "PDMAsyncCompletionFileInternal.h"
/** The update period for the I/O load statistics in ms. */
#define PDMACEPFILEMGR_LOAD_UPDATE_PERIOD 1000
/** Maximum number of requests a manager will handle. */
/*******************************************************************************
* Internal functions *
*******************************************************************************/
/* NOTE(review): the function signature and several statements are missing from
 * this extract (presumably the I/O manager init routine — TODO confirm against
 * the full file); the surviving code below is left byte-identical. */
{
int rc = VINF_SUCCESS;
/* NOTE(review): the two 'if' statements below lost their bodies/conditions in the extract. */
if (rc == VERR_OUT_OF_RANGE)
if (RT_SUCCESS(rc))
{
/* Initialize request handle array. */
pAioMgr->iFreeEntryNext = 0;
pAioMgr->iFreeReqNext = 0;
if (pAioMgr->pahReqsFree)
{
/* Create the range lock memcache. */
if (RT_SUCCESS(rc))
return VINF_SUCCESS;
}
else
{
/* Allocation of the free-request array failed. */
rc = VERR_NO_MEMORY;
}
}
return rc;
}
/* NOTE(review): an entire function body was stripped here (only nested braces
 * remain); nothing can be documented beyond that. */
{
{
}
}
/**
 * Sorts the endpoint list with insertion sort.
 */
/* NOTE(review): the function signature and most statements are missing from
 * this extract; only the loop skeleton and DEBUG validation remain. */
{
while (pEpCurr)
{
/* Remember the next element to sort because the list might change. */
/* Unlink the current element from the list. */
if (pPrev)
else
if (pNext)
/* Go back until we reached the place to insert the current endpoint into. */
/* Link the endpoint into the list. */
if (pEpPrev)
else
if (pNext)
if (pEpPrev)
else
}
#ifdef DEBUG
/* Validate sorting algorithm */
unsigned cEndpoints = 0;
AssertMsg(!pEpCurr->AioMgr.pEndpointPrev, ("First element in the list points to previous element\n"));
while (pEpCurr)
{
cEndpoints++;
}
#endif
}
/**
 * Removes an endpoint from the currently assigned manager.
 *
 * @returns TRUE if there are still requests pending on the current manager for this endpoint.
 * FALSE otherwise.
 * @param pEndpointRemove The endpoint to remove.
 */
/* NOTE(review): signature and unlink statements missing from this extract;
 * remaining code left byte-identical. */
{
pAioMgr->cEndpoints--;
if (pPrev)
else
if (pNext)
/* Make sure that there is no request pending on this manager for the endpoint. */
{
/* Reopen the file so that the new endpoint can reassociate with the file */
return false;
}
return true;
}
/* NOTE(review): signature missing — judging from the return expression this
 * predicate reports whether load balancing across endpoints makes sense. */
{
/* Balancing doesn't make sense with only one endpoint. */
return false;
/* Doesn't make sense to move endpoints if only one produces the whole load */
unsigned cEndpointsWithLoad = 0;
while (pCurr)
{
}
return (cEndpointsWithLoad > 1);
}
/**
 * Creates a new I/O manager and spreads the I/O load of the endpoints
 * between the given I/O manager and the new one.
 *
 * @returns nothing.
 * @param pAioMgr The I/O manager with high I/O load.
 */
/* NOTE(review): signature and many statements (manager creation, endpoint
 * moves) are missing from this extract; code left byte-identical. */
{
int rc = VINF_SUCCESS;
/*
* Check if balancing would improve the situation.
*/
{
PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pAioMgr->pEndpointsHead->Core.pEpClass;
if (RT_SUCCESS(rc))
{
/* We will sort the list by request count per second. */
/* Now move some endpoints to the new manager. */
unsigned cReqsOther = 0;
while (pCurr)
{
if (cReqsHere <= cReqsOther)
{
/*
* The other manager has more requests to handle now.
* We will keep the current endpoint.
*/
}
else
{
/* Move to other endpoint. */
Log(("Moving endpoint %#p{%s} with %u reqs/s to other manager\n", pCurr, pCurr->Core.pszUri, pCurr->AioMgr.cReqsPerSec));
if (fReqsPending)
{
}
else
{
}
}
}
}
else
{
/* Don't process further but leave a log entry about reduced performance. */
}
}
else
Log(("AIOMgr: Load balancing would not improve anything\n"));
}
/**
 * Error handler which will create the failsafe managers and destroy the failed I/O manager.
 *
 * @returns VBox status code
 * @param pAioMgr The I/O manager the error occurred on.
 * @param rc The error code.
 */
/* NOTE(review): signature and the first LogRel's argument list are truncated
 * in this extract. The override forces all endpoints of this class to the
 * simple (failsafe) manager type. */
{
LogRel(("AIOMgr: I/O manager %#p encountered a critical error (rc=%Rrc) during operation. Falling back to failsafe mode. Expect reduced performance\n",
LogRel(("AIOMgr: Please contact the product vendor\n"));
PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pAioMgr->pEndpointsHead->Core.pEpClass;
ASMAtomicWriteU32((volatile uint32_t *)&pEpClassFile->enmMgrTypeOverride, PDMACEPFILEMGRTYPE_SIMPLE);
AssertMsgFailed(("Implement\n"));
return VINF_SUCCESS;
}
/**
 * Put a list of tasks in the pending request list of an endpoint.
 */
/* NOTE(review): the list-splice statements are missing from this extract;
 * only the branch skeleton remains. */
DECLINLINE(void) pdmacFileAioMgrEpAddTaskList(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMACTASKFILE pTaskHead)
{
/* Add the rest of the tasks to the pending list */
{
}
else
{
}
/* Update the tail. */
}
/**
 * Put one task in the pending request list of an endpoint.
 */
/* NOTE(review): statements missing from this extract; only the branch
 * skeleton remains. */
DECLINLINE(void) pdmacFileAioMgrEpAddTask(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMACTASKFILE pTask)
{
/* Add the task to the pending list */
{
}
else
{
}
}
/**
 * Wrapper around RTFileAioCtxSubmit() which is also doing error handling.
 */
/* NOTE(review): signature, the submit call itself, and several conditions are
 * missing from this extract; code left byte-identical. */
{
int rc;
LogFlow(("Enqueuing %d requests. I/O manager has a total of %d active requests now\n", cReqs, pAioMgr->cRequestsActive));
if (RT_FAILURE(rc))
{
{
/*
* We run out of resources.
* Need to check which requests got queued
* and put the rest on the pending list again.
*/
{
pEpClass->fOutOfResourcesWarningPrinted = true;
LogRel(("AIOMgr: The operating system doesn't have enough resources "
"to handle the I/O load of the VM. Expect reduced I/O performance\n"));
}
{
if (rcReq != VERR_FILE_AIO_IN_PROGRESS)
{
("Request returned unexpected return code: rc=%Rrc\n", rcReq));
/* Put the entry on the free array */
}
}
LogFlow(("Removed requests. I/O manager has a total of %d active requests now\n", pAioMgr->cRequestsActive));
}
else
}
return rc;
}
/**
 * Allocates an async I/O request.
 *
 * @returns Handle to the request.
 * @param pAioMgr The I/O manager.
 */
/* NOTE(review): signature and allocation statements missing from this
 * extract; only the branch skeleton and return remain. */
{
/* Get a request handle. */
{
}
else
{
}
return hReq;
}
/* NOTE(review): signature missing — this is the range-lock predicate named in
 * the call at pdmacFileAioMgrNormalIsRangeLocked below (L309). Returns true
 * and queues the task on the lock's waiting list when the requested range
 * intersects an already locked range. */
{
pRangeLock = (PPDMACFILERANGELOCK)RTAvlrFileOffsetGet(pEndpoint->AioMgr.pTreeRangesLocked, offStart);
if (!pRangeLock)
{
/* No exact match — look for the best-fit node that may still overlap. */
pRangeLock = (PPDMACFILERANGELOCK)RTAvlrFileOffsetGetBestFit(pEndpoint->AioMgr.pTreeRangesLocked, offStart, true);
/* Check if we intersect with the range. */
if ( !pRangeLock
{
}
}
/* Check whether we have one of the situations explained below */
if ( pRangeLock
#if 0 /** @todo: later. For now we will just block all requests if they interfere */
|| (!pRangeLock->fReadLock)
#endif
)
{
/* Add to the list. */
if (!pRangeLock->pWaitingTasksHead)
{
}
else
{
}
return true;
}
return false;
}
/* NOTE(review): signature and the Assert wrapping the first string are
 * truncated. Allocates a range-lock record from the manager's memcache;
 * the second, empty body below is a fully stripped sibling function
 * (presumably the matching unlock). */
{
("Range is already locked offStart=%RTfoff cbRange=%u\n",
PPDMACFILERANGELOCK pRangeLock = (PPDMACFILERANGELOCK)RTMemCacheAlloc(pAioMgr->hMemCacheRangeLocks);
if (!pRangeLock)
return VERR_NO_MEMORY;
/* Init the lock. */
/* Let the task point to its lock. */
return VINF_SUCCESS;
}
{
}
/* NOTE(review): signature and the request-setup statements are missing from
 * this extract; code left byte-identical. */
{
int rc = VINF_SUCCESS;
PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pEndpoint->Core.pEpClass;
("Read exceeds file size offStart=%RTfoff cbToTransfer=%d cbFile=%llu\n",
pTask->fBounceBuffer = false;
/*
* Before we start to setup the request we have to check whether there is a task
* already active which range intersects with ours. We have to defer execution
* of this task in two cases:
* - The pending task is a write and the current is either read or write
* - The pending task is a read and the current task is a write task.
*
* To check whether a range is currently "locked" we use the AVL tree where every pending task
* is stored by its file offset range. The current task will be added to the active task
* and will be executed when the active one completes. (The method below
* which checks whether a range is already used will add the task)
*
* This is necessary because of the requirement to align all requests to a 512 boundary
* which is enforced by the host OS (Linux and Windows atm). It is possible that
* we have to process unaligned tasks and need to align them using bounce buffers.
* While the data is fetched from the file another request might arrive writing to
* the same range. This will result in data corruption if both are executed concurrently.
*/
bool fLocked = pdmacFileAioMgrNormalIsRangeLocked(pEndpoint, pTask->Off, pTask->DataSeg.cbSeg, pTask);
if (!fLocked)
{
/* Get a request handle. */
{
/* Grow the file if needed. */
{
}
}
else
pTask);
if (RT_SUCCESS(rc))
}
else
return rc;
}
/* NOTE(review): signature and most setup statements are missing from this
 * extract; the surviving code shows the bounce-buffer path for requests that
 * violate the 512-byte alignment requirement. Code left byte-identical. */
{
int rc = VINF_SUCCESS;
PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pEndpoint->Core.pEpClass;
/*
* Check if the alignment requirements are met.
* Offset, transfer size and buffer address
* need to be on a 512 boundary.
*/
("Read exceeds file size offStart=%RTfoff cbToTransfer=%d cbFile=%llu\n",
/*
* Before we start to setup the request we have to check whether there is a task
* already active which range intersects with ours. We have to defer execution
* of this task in two cases:
* - The pending task is a write and the current is either read or write
* - The pending task is a read and the current task is a write task.
*
* To check whether a range is currently "locked" we use the AVL tree where every pending task
* is stored by its file offset range. The current task will be added to the active task
* and will be executed when the active one completes. (The method below
* which checks whether a range is already used will add the task)
*
* This is necessary because of the requirement to align all requests to a 512 boundary
* which is enforced by the host OS (Linux and Windows atm). It is possible that
* we have to process unaligned tasks and need to align them using bounce buffers.
* While the data is fetched from the file another request might arrive writing to
* the same range. This will result in data corruption if both are executed concurrently.
*/
if (!fLocked)
{
/* Get a request handle. */
{
LogFlow(("Using bounce buffer for task %#p cbToTransfer=%zd cbSeg=%zd offStart=%RTfoff off=%RTfoff\n",
/* Create bounce buffer. */
pTask->fBounceBuffer = true;
/** @todo: I think we need something like a RTMemAllocAligned method here.
* Current assumption is that the maximum alignment is 4096byte
* (GPT disk on Windows)
* so we can use RTMemPageAlloc here.
*/
{
{
{
/* We have to fill the buffer first before we can update the data. */
}
else
}
}
else
rc = VERR_NO_MEMORY;
}
else
pTask->fBounceBuffer = false;
if (RT_SUCCESS(rc))
{
("AIO: Alignment restrictions not met! pvBuf=%p uBitmaskAlignment=%p\n", pvBuf, pEpClassFile->uBitmaskAlignment));
{
/* Grow the file if needed. */
{
}
}
else
if (RT_SUCCESS(rc))
else
{
/* Cleanup */
if (pTask->fBounceBuffer)
}
}
}
else
return rc;
}
/* NOTE(review): signature and many statements (flush handling, request
 * preparation, submission) are missing from this extract; the surviving
 * skeleton processes a task list until the manager is full, the bandwidth
 * limit is hit, or an error occurs. Code left byte-identical. */
{
unsigned cRequests = 0;
int rc = VINF_SUCCESS;
("Trying to process request lists of a non active endpoint!\n"));
/* Go through the list and queue the requests until we get a flush request */
while ( pTaskHead
&& (cMaxRequests > 0)
&& RT_SUCCESS(rc))
{
{
pAioMgr->fBwLimitReached = true;
break;
}
("Endpoints do not match\n"));
switch (pCurr->enmTransferType)
{
{
/* If there is no data transfer request this flush request finished immediately. */
{
}
else
{
}
break;
}
{
else
if (hReq != NIL_RTFILEAIOREQ)
{
cMaxRequests--;
cRequests++;
{
cRequests = 0;
("Unexpected return code\n"));
}
}
break;
}
default:
}
}
if (cRequests)
{
("Unexpected return code rc=%Rrc\n", rc));
}
if (pTaskHead)
{
/* Add the rest of the tasks to the pending list */
if (RT_UNLIKELY( !cMaxRequests
&& !pAioMgr->fBwLimitReached))
{
/*
* The I/O manager has no room left for more requests
* but there are still requests to process.
* Create a new I/O manager and let it handle some endpoints.
*/
}
}
/* Insufficient resources are not fatal. */
rc = VINF_SUCCESS;
return rc;
}
/**
 * Adds all pending requests for the given endpoint
 * until a flush request is encountered or there is no
 * request anymore.
 *
 * @returns VBox status code.
 * @param pAioMgr The async I/O manager for the endpoint
 * @param pEndpoint The endpoint to get the requests from.
 */
/* NOTE(review): signature and the calls into the task-list processor are
 * missing from this extract; code left byte-identical. */
{
int rc = VINF_SUCCESS;
("Trying to process request lists of a non active endpoint!\n"));
/* Check the pending list first */
{
LogFlow(("Queuing pending requests first\n"));
/*
* Clear the list as the processing routine will insert them into the list
* again if it gets a flush request.
*/
}
{
/* Now the request queue. */
if (pTasksHead)
{
}
}
return rc;
}
/* NOTE(review): signature and the case labels of the switch are missing from
 * this extract — each brace group below was a case (add / remove / close
 * endpoint, suspend, resume, shutdown judging from the BlockingEventData
 * members read). Code left byte-identical. */
{
int rc = VINF_SUCCESS;
bool fNotifyWaiter = false;
LogFlowFunc((": Enter\n"));
switch (pAioMgr->enmBlockingEvent)
{
{
PPDMASYNCCOMPLETIONENDPOINTFILE pEndpointNew = (PPDMASYNCCOMPLETIONENDPOINTFILE)ASMAtomicReadPtr((void * volatile *)&pAioMgr->BlockingEventData.AddEndpoint.pEndpoint);
if (pAioMgr->pEndpointsHead)
/* Assign the completion point to this file. */
fNotifyWaiter = true;
pAioMgr->cEndpoints++;
break;
}
{
PPDMASYNCCOMPLETIONENDPOINTFILE pEndpointRemove = (PPDMASYNCCOMPLETIONENDPOINTFILE)ASMAtomicReadPtr((void * volatile *)&pAioMgr->BlockingEventData.RemoveEndpoint.pEndpoint);
break;
}
{
PPDMASYNCCOMPLETIONENDPOINTFILE pEndpointClose = (PPDMASYNCCOMPLETIONENDPOINTFILE)ASMAtomicReadPtr((void * volatile *)&pAioMgr->BlockingEventData.CloseEndpoint.pEndpoint);
{
/* Make sure all tasks finished. Process the queues a last time first. */
}
fNotifyWaiter = true;
break;
}
{
if (!pAioMgr->cRequestsActive)
fNotifyWaiter = true;
break;
}
{
break;
}
{
fNotifyWaiter = true;
break;
}
default:
}
if (fNotifyWaiter)
{
/* Release the waiting thread. */
LogFlow(("Signalling waiter\n"));
}
LogFlowFunc((": Leave\n"));
return rc;
}
/**
 * Checks all endpoints for pending events or new requests.
 *
 * @returns VBox status code.
 * @param pAioMgr The I/O manager handle.
 */
/* NOTE(review): signature and loop-advance statements are missing from this
 * extract; code left byte-identical. */
{
/* Check the assigned endpoints for new tasks if there isn't a flush request active at the moment. */
int rc = VINF_SUCCESS;
pAioMgr->fBwLimitReached = false;
while (pEndpoint)
{
{
if (RT_FAILURE(rc))
return rc;
}
{
/* Reopen the file so that the new endpoint can reassociate with the file */
{
}
else
{
/* Release the waiting thread. */
LogFlow(("Signalling waiter\n"));
}
}
}
return rc;
}
/* NOTE(review): signature, the RTFileAioReqGetRC call, and most statements are
 * missing from this extract; the surviving skeleton handles request failure
 * (migration to a failsafe manager) versus success (completion callback,
 * bounce-buffer write-back). Code left byte-identical. */
{
int rc = VINF_SUCCESS;
size_t cbTransfered = 0;
/*
* It is possible that the request failed on Linux with kernels < 2.6.23
* if the passed buffer was allocated with remap_pfn_range or if the file
* is on an NFS endpoint which does not support async and direct I/O at the same time.
* The endpoint will be migrated to a failsafe manager in case a request fails.
*/
if (RT_FAILURE(rcReq))
{
/* Free bounce buffers and the IPRT request. */
/* Free the lock and process pending tasks if necessary */
if (pTask->fBounceBuffer)
/* Queue the request on the pending list. */
/* Create a new failsafe manager if necessary. */
{
LogRel(("%s: Request %#p failed with rc=%Rrc, migrating endpoint %s to failsafe manager.\n",
/* Update the flags to open the file with. Disable async I/O and enable the host cache. */
}
/* If this was the last request for the endpoint migrate it to the new manager. */
{
}
}
else
{
("Task didn't completed successfully (rc=%Rrc) or was incomplete (cbTransfered=%u)\n", rcReq, cbTransfered));
{
/* Write it now. */
/* Grow the file if needed. */
{
}
}
else
{
{
}
/* Put the entry on the free array */
/* Free the lock and process pending tasks if necessary */
/* Call completion callback */
/*
* If there is no request left on the endpoint but a flush request is set
* it completed now and we notify the owner.
* Furthermore we look for new requests and continue.
*/
{
/* Call completion callback */
AssertMsg(pTask->pEndpoint == pEndpoint, ("Endpoint of the flush request does not match assigned one\n"));
}
{
/* If the endpoint is about to be migrated do it now. */
}
}
} /* request completed successfully */
}
/** Helper macro for checking for error codes. */
/* NOTE(review): the '#define NAME(...)' line of this macro is missing from
 * the extract; only the continuation lines of its body survive. */
if (RT_FAILURE(rc)) \
{\
return rc2;\
}
/**
 * The normal I/O manager using the RTFileAio* API
 *
 * @returns VBox status code.
 * @param ThreadSelf Handle of the thread.
 * @param pvUser Opaque user data.
 */
/* NOTE(review): the thread-function signature, the wait/completion calls, and
 * many statements are missing from this extract; the surviving skeleton is
 * the manager's main loop (wake up, drain completions, update per-endpoint
 * load statistics every PDMACEPFILEMGR_LOAD_UPDATE_PERIOD ms). Code left
 * byte-identical. */
{
int rc = VINF_SUCCESS;
{
LogFlow(("Got woken up\n"));
/* Check for an external blocking event first. */
if (pAioMgr->fBlockingEventPending)
{
}
{
/* We got woken up because an endpoint issued new requests. Queue them. */
while ( pAioMgr->cRequestsActive
|| pAioMgr->fBwLimitReached)
{
if (pAioMgr->cRequestsActive)
{
uint32_t cReqsCompleted = 0;
else
for (uint32_t i = 0; i < cReqsCompleted; i++)
/* Check for an external blocking event before we go to sleep again. */
if (pAioMgr->fBlockingEventPending)
{
}
/* Update load statistics. */
if (uMillisCurr > uMillisEnd)
{
/* Calculate timespan. */
while (pEndpointCurr)
{
pEndpointCurr->AioMgr.cReqsPerSec = pEndpointCurr->AioMgr.cReqsProcessed / (uMillisCurr + PDMACEPFILEMGR_LOAD_UPDATE_PERIOD);
}
/* Set new update interval */
}
}
else
{
/*
* Bandwidth limit reached for all endpoints.
* Yield and wait until we have enough resources again.
*/
}
/* Check endpoints for new requests. */
} /* while requests are active. */
} /* if still running */
} /* while running */
return rc;
}