PDMAsyncCompletionFileCache.cpp revision 2640b304447a3353a161e04ea294b8b66e18dc5d
/* $Id$ */
/** @file
* PDM Async I/O - Transport data asynchronous in R3 using EMT.
* File data cache.
*/
/*
* Copyright (C) 2006-2008 Sun Microsystems, Inc.
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
* Clara, CA 95054 USA or visit http://www.sun.com if you need
* additional information or have any questions.
*/
/** @page pg_pdm_async_completion_cache PDM Async Completion Cache - The file I/O cache
* This component implements an I/O cache for file endpoints based on the 2Q cache algorithm.
*/
/*******************************************************************************
* Header Files *
*******************************************************************************/
#include "PDMAsyncCompletionFileInternal.h"
/**
* A I/O memory context.
*/
typedef struct PDMIOMEMCTX
{
/** Number of segments. */
/** Current segment we are in. */
unsigned iSegIdx;
/** Pointer to the current buffer. */
/** Number of bytes left in the current buffer. */
} PDMIOMEMCTX, *PPDMIOMEMCTX;
#ifdef VBOX_STRICT
# define PDMACFILECACHE_IS_CRITSECT_OWNER(Cache) \
do \
{ \
("Thread does not own critical section\n"));\
} while(0);
#else
# define PDMACFILECACHE_IS_CRITSECT_OWNER(Cache) do { } while(0);
#endif
/*******************************************************************************
* Internal Functions *
*******************************************************************************/
/**
* Decrement the reference counter of the given cache entry.
*
* @returns nothing.
* @param pEntry The entry to release.
*/
{
}
/**
* Increment the reference counter of the given cache entry.
*
* @returns nothing.
* @param pEntry The entry to reference.
*/
{
}
/**
* Initialize a I/O memory context.
*
* @returns nothing
* @param pIoMemCtx Pointer to a unitialized I/O memory context.
* @param paDataSeg Pointer to the S/G list.
* @param cSegments Number of segments in the S/G list.
*/
{
AssertMsg((cSegments > 0) && paDataSeg, ("Trying to initialize a I/O memory context without a S/G list\n"));
}
/**
* Return a buffer from the I/O memory context.
*
* @returns Pointer to the buffer
* @param pIoMemCtx Pointer to the I/O memory context.
* @param pcbData Pointer to the amount of byte requested.
* If the current buffer doesn't have enough bytes left
* the amount is returned in the variable.
*/
{
/* Advance to the next segment if required. */
{
{
}
else
{
}
}
else
return pbBuf;
}
#ifdef DEBUG
{
/* Amount of cached data should never exceed the maximum amount. */
("Current amount of cached data exceeds maximum\n"));
/* The amount of cached data in the LRU and FRU list should match cbCached */
AssertMsg(pCache->LruRecentlyUsedIn.cbCached + pCache->LruFrequentlyUsed.cbCached == pCache->cbCached,
("Amount of cached data doesn't match\n"));
("Paged out list exceeds maximum\n"));
}
#endif
{
#ifdef DEBUG
#endif
}
{
#ifdef DEBUG
#endif
}
{
}
{
}
{
}
{
}
/**
* Checks consistency of a LRU list.
*
* @returns nothing
* @param pList The LRU list to check.
* @param pNotInList Element which is not allowed to occur in the list.
*/
{
/* Check that there are no double entries and no cycles in the list. */
while (pCurr)
{
while (pNext)
{
("Entry %#p is at least two times in list %#p or there is a cycle in the list\n",
}
}
}
#endif
/**
* Unlinks a cache entry from the LRU list it is assigned to.
*
* @returns nothing.
* @param pEntry The entry to unlink.
*/
{
#endif
if (pPrev)
else
{
if (pNext)
}
if (pNext)
else
{
if (pPrev)
}
#endif
}
/**
* Adds a cache entry to the given LRU list unlinking it from the currently
* assigned list if needed.
*
* @returns nothing.
* @param pList List to the add entry to.
* @param pEntry Entry to add.
*/
{
#endif
/* Remove from old list if needed */
else
{
}
#endif
}
/**
* Destroys a LRU list freeing all entries.
*
* @returns nothing
* @param pList Pointer to the LRU list to destroy.
*
* @note The caller must own the critical section of the cache.
*/
{
{
AssertMsg(!(pEntry->fFlags & (PDMACFILECACHE_ENTRY_IO_IN_PROGRESS | PDMACFILECACHE_ENTRY_IS_DIRTY)),
}
}
/**
* Tries to remove the given amount of bytes from a given list in the cache
* moving the entries to one of the given ghosts lists
*
* @returns Amount of data which could be freed.
* @param pCache Pointer to the global cache data.
* @param cbData The amount of the data to free.
* @param pListSrc The source list to evict data from.
* @param pGhostListSrc The ghost list removed entries should be moved to
* NULL if the entry should be freed.
* @param fReuseBuffer Flag whether a buffer should be reused if it has the same size
* @param ppbBuf Where to store the address of the buffer if an entry with the
* same size was found and fReuseBuffer is true.
*
* @note This function may return fewer bytes than requested because entries
* may be marked as non evictable if they are used for I/O at the
* moment.
*/
{
("Destination list must be NULL or the recently used but paged out list\n"));
if (fReuseBuffer)
{
}
/* Start deleting from the tail. */
{
/* We can't evict pages which are currently in progress */
{
/* Ok eviction candidate. Grab the endpoint semaphore and check again
* because somebody else might have raced us. */
{
("This entry is deprecated so it should have the I/O in progress flag set\n"));
{
}
if (pGhostListDst)
{
/* We have to remove the last entries from the paged out list. */
&& pGhostEntFree)
{
{
}
}
{
/* Couldn't remove enough entries. Delete */
}
else
}
else
{
/* Delete the entry from the AVL tree it is assigned to. */
}
}
}
else
LogFlow(("Entry %#p (%u bytes) is still in progress and can't be evicted\n", pCurr, pCurr->cbData));
}
return cbEvicted;
}
static bool pdmacFileCacheReclaim(PPDMACFILECACHEGLOBAL pCache, size_t cbData, bool fReuseBuffer, uint8_t **ppbBuffer)
{
return true;
{
/* Try to evict as many bytes as possible from A1in */
/*
* If it was not possible to remove enough entries
* try the frequently accessed cache.
*/
{
Assert(!fReuseBuffer || !*ppbBuffer); /* It is not possible that we got a buffer with the correct size but we didn't freed enough data. */
}
}
else
{
/* We have to remove entries from frequently access list. */
}
}
/**
* Initiates a read I/O task for the given entry.
*
* @returns nothing.
* @param pEntry The entry to fetch the data to.
*/
{
/* Make sure no one evicts the entry while it is accessed. */
/* Send it off to the I/O manager. */
}
/**
* Initiates a write I/O task for the given entry.
*
* @returns nothing.
* @param pEntry The entry to read the data from.
*/
{
/* Make sure no one evicts the entry while it is accessed. */
/* Send it off to the I/O manager. */
}
/**
* Completes a task segment freeing all ressources and completes the task handle
* if everything was transfered.
*
* @returns Next task segment handle.
* @param pEndpointCache The endpoint cache.
* @param pTaskSeg Task segment to complete.
*/
static PPDMACFILETASKSEG pdmacFileCacheTaskComplete(PPDMACFILEENDPOINTCACHE pEndpointCache, PPDMACFILETASKSEG pTaskSeg)
{
return pNext;
}
/**
* Completion callback for I/O tasks.
*
* @returns nothing.
* @param pTask The completed task.
* @param pvUser Opaque user data.
*/
{
/* Reference the entry now as we are clearing the I/O in progres flag
* which protects the entry till now. */
/* Process waiting segment list. The data in entry might have changed inbetween. */
("The list tail was not updated correctly\n"));
{
AssertMsg(pEndpointCache->cWritesOutstanding > 0, ("Completed write request but outstanding task count is 0\n"));
{
}
else
{
while (pCurr)
{
}
}
}
else
{
while (pCurr)
{
{
}
else
}
}
/* Complete a pending flush if all writes have completed */
{
PPDMASYNCCOMPLETIONTASKFILE pTaskFlush = (PPDMASYNCCOMPLETIONTASKFILE)ASMAtomicXchgPtr((void * volatile *)&pEndpointCache->pTaskFlush, NULL);
if (pTaskFlush)
}
/* Dereference so that it isn't protected anymore except we issued anyother write for it. */
}
/**
* Initializies the I/O cache.
*
* returns VBox status code.
* @param pClassFile The global class data for file endpoints.
* @param pCfgNode CFGM node to query configuration data from.
*/
{
int rc = VINF_SUCCESS;
/* Initialize members */
LogFlowFunc((": cbRecentlyUsedInMax=%u cbRecentlyUsedOutMax=%u\n", pCache->cbRecentlyUsedInMax, pCache->cbRecentlyUsedOutMax));
"/PDM/AsyncCompletion/File/cbMax",
"Maximum cache size");
"Currently used cache");
"Number of bytes cached in MRU list");
"Number of bytes cached in FRU list");
"Number of bytes cached in FRU ghost list");
#ifdef VBOX_WITH_STATISTICS
STAMUNIT_COUNT, "Number of hits in the cache");
STAMUNIT_COUNT, "Number of partial hits in the cache");
STAMUNIT_COUNT, "Number of misses when accessing the cache");
STAMUNIT_BYTES, "Number of bytes read from the cache");
STAMUNIT_BYTES, "Number of bytes written to the cache");
STAMUNIT_TICKS_PER_CALL, "Time taken to access an entry in the tree");
STAMUNIT_TICKS_PER_CALL, "Time taken to insert an entry in the tree");
STAMUNIT_TICKS_PER_CALL, "Time taken to remove an entry an the tree");
STAMUNIT_COUNT, "Number of times a buffer could be reused");
#endif
/* Initialize the critical section */
if (RT_SUCCESS(rc))
return rc;
}
/**
* Destroysthe cache freeing all data.
*
* returns nothing.
* @param pClassFile The global class data for file endpoints.
*/
{
/* Make sure no one else uses the cache now */
/* Cleanup deleting all cache entries waiting for in progress entries to finish. */
}
/**
* Initializes per endpoint cache data
* like the AVL tree used to access cached entries.
*
* @returns VBox status code.
* @param pEndpoint The endpoint to init the cache for,
* @param pClassFile The global class data for file endpoints.
*/
int pdmacFileEpCacheInit(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMASYNCCOMPLETIONEPCLASSFILE pClassFile)
{
if (RT_SUCCESS(rc))
{
if (!pEndpointCache->pTree)
{
rc = VERR_NO_MEMORY;
}
}
#ifdef VBOX_WITH_STATISTICS
if (RT_SUCCESS(rc))
{
STAMUNIT_COUNT, "Number of deferred writes",
}
#endif
return rc;
}
/**
* Callback for the AVL destroy routine. Frees a cache entry for this endpoint.
*
* @returns IPRT status code.
* @param pNode The node to destroy.
* @param pvUser Opaque user data.
*/
{
while (ASMAtomicReadU32(&pEntry->fFlags) & (PDMACFILECACHE_ENTRY_IO_IN_PROGRESS | PDMACFILECACHE_ENTRY_IS_DIRTY))
{
RTThreadSleep(250);
}
AssertMsg(!(pEntry->fFlags & (PDMACFILECACHE_ENTRY_IO_IN_PROGRESS | PDMACFILECACHE_ENTRY_IS_DIRTY)),
if (fUpdateCache)
return VINF_SUCCESS;
}
/**
* Destroys all cache ressources used by the given endpoint.
*
* @returns nothing.
* @param pEndpoint The endpoint to the destroy.
*/
{
/* Make sure nobody is accessing the cache while we delete the tree. */
#ifdef VBOX_WITH_STATISTICS
PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pEndpoint->Core.pEpClass;
#endif
}
static PPDMACFILECACHEENTRY pdmacFileEpCacheGetCacheEntryByOffset(PPDMACFILEENDPOINTCACHE pEndpointCache, RTFOFF off)
{
if (pEntry)
return pEntry;
}
static PPDMACFILECACHEENTRY pdmacFileEpCacheGetCacheBestFitEntryByOffset(PPDMACFILEENDPOINTCACHE pEndpointCache, RTFOFF off)
{
pEntry = (PPDMACFILECACHEENTRY)RTAvlrFileOffsetGetBestFit(pEndpointCache->pTree, off, true /*fAbove*/);
if (pEntry)
return pEntry;
}
static void pdmacFileEpCacheInsertEntry(PPDMACFILEENDPOINTCACHE pEndpointCache, PPDMACFILECACHEENTRY pEntry)
{
}
/**
* Allocates and initializes a new entry for the cache.
* The entry has a reference count of 1.
*
* @returns Pointer to the new cache entry or NULL if out of memory.
* @param pCache The cache the entry belongs to.
* @param pEndoint The endpoint the entry holds data for.
* @param off Start offset.
* @param cbData Size of the cache entry.
* @param pbBuffer Pointer to the buffer to use.
* NULL if a new buffer should be allocated.
* The buffer needs to have the same size of the entry.
*/
{
if (RT_UNLIKELY(!pEntryNew))
return NULL;
if (pbBuffer)
else
{
return NULL;
}
return pEntryNew;
}
/**
* Adds a segment to the waiting list for a cache entry
* which is currently in progress.
*
* @returns nothing.
* @param pEntry The cache entry to add the segment to.
* @param pSeg The segment to add.
*/
DECLINLINE(void) pdmacFileEpCacheEntryAddWaitingSegment(PPDMACFILECACHEENTRY pEntry, PPDMACFILETASKSEG pSeg)
{
if (pEntry->pWaitingHead)
{
}
else
{
}
}
/**
* in exclusive mode.
*
* @returns true if the flag in fSet is set and the one in fClear is clear.
* false othwerise.
* The R/W semaphore is only held if true is returned.
*
* @param pEndpointCache The endpoint cache instance data.
* @param pEntry The entry to check the flags for.
* @param fSet The flag which is tested to be set.
* @param fClear The flag which is tested to be clear.
*/
DECLINLINE(bool) pdmacFileEpCacheEntryFlagIsSetClearAcquireLock(PPDMACFILEENDPOINTCACHE pEndpointCache,
{
if (fPassed)
{
/* Acquire the lock and check again becuase the completion callback might have raced us. */
/* Drop the lock if we didn't passed the test. */
if (!fPassed)
}
return fPassed;
}
/**
* Copies data to a buffer described by a I/O memory context.
*
* @returns nothing.
* @param pIoMemCtx The I/O memory context to copy the data into.
* @param pbData Pointer to the data data to copy.
* @param cbData Amount of data to copy.
*/
{
while (cbData)
{
}
}
/**
* Copies data from a buffer described by a I/O memory context.
*
* @returns nothing.
* @param pIoMemCtx The I/O memory context to copy the data from.
* @param pbData Pointer to the destination buffer.
* @param cbData Amount of data to copy.
*/
{
while (cbData)
{
}
}
/**
* Add a buffer described by the I/O memory context
* to the entry waiting for completion.
*
* @returns nothing.
* @param pEntry The entry to add the buffer to.
* @param pTask Task associated with the buffer.
* @param pIoMemCtx The memory context to use.
* @param OffDiff Offset from the start of the buffer
* in the entry.
* @param cbData Amount of data to wait for onthis entry.
* @param fWrite Flag whether the task waits because it wants to write
* to the cache entry.
*/
bool fWrite)
{
while (cbData)
{
}
}
/**
* Passthrough a part of a request directly to the I/O manager
* handling the endpoint.
*
* @returns nothing.
* @param pEndpoint The endpoint.
* @param pTask The task.
* @param pIoMemCtx The I/O memory context to use.
* @param offStart Offset to start transfer from.
* @param cbData Amount of data to transfer.
*/
{
while (cbData)
{
/* Send it off to the I/O manager. */
}
}
/**
* Reads the specified data from the endpoint using the cache if possible.
*
* @returns VBox status code.
* @param pEndpoint The endpoint to read from.
* @param pTask The task structure used as identifier for this request.
* @param off The offset to start reading from.
* @param paSegments Pointer to the array holding the destination buffers.
* @param cSegments Number of segments in the array.
* @param cbRead Number of bytes to read.
*/
int pdmacFileEpCacheRead(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMASYNCCOMPLETIONTASKFILE pTask,
{
int rc = VINF_SUCCESS;
LogFlowFunc((": pEndpoint=%#p{%s} pTask=%#p off=%RTfoff paSegments=%#p cSegments=%u cbRead=%u\n",
/* Set to completed to make sure that the task is valid while we access it. */
/* Init the I/O memory context */
while (cbRead)
{
/*
* If there is no entry we try to create a new one eviciting unused pages
* if the cache is full. If this is not possible we will pass the request through
* and skip the caching (all entries may be still in progress so they can't
* be evicted)
* If we have an entry it can be in one of the LRU lists where the entry
* contains data (recently used or frequently used LRU) so we can just read
* the data we need and put the entry at the head of the frequently used LRU list.
* In case the entry is in one of the ghost lists it doesn't contain any data.
* We have to fetch it again evicting pages from either T1 or T2 to make room.
*/
if (pEntry)
{
("Overflow in calculation off=%RTfoff OffsetAligned=%RTfoff\n",
("Buffer of cache entry exceeded off=%RTfoff cbToRead=%d\n",
if (!cbRead)
else
/* Ghost lists contain no data. */
{
0))
{
/* Entry is deprecated. Read data from the new buffer. */
}
else
{
{
/* Entry didn't completed yet. Append to the list */
&IoMemCtx,
false /* fWrite */);
}
else
{
/* Read as much as we can from the entry. */
}
}
/* Move this entry to the top position */
{
}
/* Release the entry */
}
else
{
pdmacFileCacheEntryRemoveFromList(pEntry); /* Remove it before we remove data, otherwise it may get freed when evicting data. */
/* Move the entry to Am and fetch it to the cache. */
if (fEnough)
{
if (pbBuffer)
else
&IoMemCtx,
false /* fWrite */);
/* Release the entry */
}
else
{
}
}
}
else
{
/* No entry found for this offset. Get best fit entry and fetch the data to the cache. */
PPDMACFILECACHEENTRY pEntryBestFit = pdmacFileEpCacheGetCacheBestFitEntryByOffset(pEndpointCache, off);
LogFlow(("%sbest fit entry for off=%RTfoff (BestFit=%RTfoff BestFitEnd=%RTfoff BestFitSize=%u)\n",
off,
if ( pEntryBestFit
{
}
else
{
/*
* Align the size to a 4KB boundary.
* Memory size is aligned to a page boundary
* and memory is wasted if the size is rahter small.
* (For example reads with a size of 512 bytes.
*/
/* Clip read to file size */
if (pEntryBestFit)
{
}
}
if (!cbRead)
else
if (fEnough)
{
PPDMACFILECACHEENTRY pEntryNew = pdmacFileCacheEntryAlloc(pCache, pEndpoint, off, cbToReadAligned, pbBuffer);
("Overflow in calculation off=%RTfoff OffsetAligned=%RTfoff\n",
false /* fWrite */);
}
else
{
/*
* There is not enough free space in the cache.
* Pass the request directly to the I/O manager.
*/
LogFlow(("Couldn't evict %u bytes from the cache. Remaining request will be passed through\n", cbToRead));
}
}
}
else
return rc;
}
/**
* Writes the given data to the endpoint using the cache if possible.
*
* @returns VBox status code.
* @param pEndpoint The endpoint to write to.
* @param pTask The task structure used as identifier for this request.
* @param off The offset to start writing to
* @param paSegments Pointer to the array holding the source buffers.
* @param cSegments Number of segments in the array.
* @param cbWrite Number of bytes to write.
*/
int pdmacFileEpCacheWrite(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMASYNCCOMPLETIONTASKFILE pTask,
{
int rc = VINF_SUCCESS;
LogFlowFunc((": pEndpoint=%#p{%s} pTask=%#p off=%RTfoff paSegments=%#p cSegments=%u cbWrite=%u\n",
/* Set to completed to make sure that the task is valid while we access it. */
/* Init the I/O memory context */
while (cbWrite)
{
if (pEntry)
{
/* Write the data into the entry and mark it as dirty */
("Overflow in calculation off=%RTfoff OffsetAligned=%RTfoff\n",
if (!cbWrite)
else
/* Ghost lists contain no data. */
{
/* Check if the buffer is deprecated. */
0))
{
("Entry is deprecated but not in progress\n"));
/* Update the data from the write. */
}
else /* Deprecated flag not set */
{
/* If the entry is dirty it must be also in progress now and we have to defer updating it again. */
0))
{
("Entry is dirty but not in progress\n"));
/* Deprecate the current buffer. */
if (!pEntry->pWaitingHead)
/* If we are out of memory or have waiting segments
* defer the write. */
{
/* The data isn't written to the file yet */
&IoMemCtx,
true /* fWrite */);
}
else /* Deprecate buffer */
{
/* Copy the data before the update. */
if (OffDiff)
/* Copy data behind the update. */
/* Update the data from the write. */
/* We are done here. A new write is initiated if the current request completes. */
}
}
else /* Dirty bit not set */
{
/*
* Check if a read is in progress for this entry.
* We have to defer processing in that case.
*/
0))
{
&IoMemCtx,
true /* fWrite */);
}
else /* I/O in progres flag not set */
{
/* Write as much as we can into the entry and update the file. */
}
} /* Dirty bit not set */
/* Move this entry to the top position */
{
} /* Deprecated flag not set. */
}
}
else /* Entry is on the ghost list */
{
pdmacFileCacheEntryRemoveFromList(pEntry); /* Remove it before we remove data, otherwise it may get freed when evicting data. */
if (fEnough)
{
/* Move the entry to Am and fetch it to the cache. */
if (pbBuffer)
else
&IoMemCtx,
true /* fWrite */);
/* Release the reference. If it is still needed the I/O in progress flag should protect it now. */
}
else
{
}
}
}
else /* No entry found */
{
/*
* No entry found. Try to create a new cache entry to store the data in and if that fails
* write directly to the file.
*/
PPDMACFILECACHEENTRY pEntryBestFit = pdmacFileEpCacheGetCacheBestFitEntryByOffset(pEndpointCache, off);
LogFlow(("%sest fit entry for off=%RTfoff (BestFit=%RTfoff BestFitEnd=%RTfoff BestFitSize=%u)\n",
off,
{
}
else
{
if (pEntryBestFit)
}
if (fEnough)
{
}
else
{
/*
* There is not enough free space in the cache.
* Pass the request directly to the I/O manager.
*/
LogFlow(("Couldn't evict %u bytes from the cache. Remaining request will be passed through\n", cbToWrite));
}
}
}
else
return rc;
}
int pdmacFileEpCacheFlush(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMASYNCCOMPLETIONTASKFILE pTask)
{
int rc = VINF_SUCCESS;
LogFlowFunc((": pEndpoint=%#p{%s} pTask=%#p\n",
else
{
{
}
else
}
return rc;
}