PDMAsyncCompletionFileCache.cpp revision ce03ea57fdcf3d48523b1de5b894feb75e1b34da
/* $Id$ */
/** @file
* PDM Async I/O - Transport data asynchronous in R3 using EMT.
* File data cache.
*/
/*
* Copyright (C) 2006-2008 Sun Microsystems, Inc.
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
* Clara, CA 95054 USA or visit http://www.sun.com if you need
* additional information or have any questions.
*/
/** @page pg_pdm_async_completion_cache PDM Async Completion Cache - The file I/O cache
* This component implements an I/O cache for file endpoints based on the ARC algorithm.
*
* The algorithm uses four LRU (Least frequently used) lists to store data in the cache.
* Two of them contain data where one stores entries which were accessed recently and one
* which is used for frequently accessed data.
* The other two lists are called ghost lists and store information about the accessed range
* but do not contain data. They are used to track data access. If these entries are accessed
* they will push the data to a higher position in the cache preventing it from getting removed
* quickly again.
*
* The algorithm needs to be modified to meet our requirements. Like the implementation
* for the ZFS filesystem we need to handle pages with a variable size. It would
* be possible to use a fixed size but would increase the computational
* and memory overhead.
* Because we do I/O asynchronously we also need to mark entries which are currently accessed
* as non evictable to prevent removal of the entry while the data is being accessed.
*/
/*******************************************************************************
* Header Files *
*******************************************************************************/
#include "PDMAsyncCompletionFileInternal.h"
#ifdef VBOX_STRICT
# define PDMACFILECACHE_IS_CRITSECT_OWNER(Cache) \
do \
{ \
("Thread does not own critical section\n"));\
} while(0);
#else
# define PDMACFILECACHE_IS_CRITSECT_OWNER(Cache) do { } while(0);
#endif
/*******************************************************************************
* Internal Functions *
*******************************************************************************/
{
}
{
}
/**
* Checks consistency of a LRU list.
*
* @returns nothing
* @param pList The LRU list to check.
* @param pNotInList Element which is not allowed to occur in the list.
*/
{
/* Check that there are no double entries and no cycles in the list. */
while (pCurr)
{
while (pNext)
{
("Entry %#p is at least two times in list %#p or there is a cycle in the list\n",
}
}
#endif
}
/**
* Unlinks a cache entry from the LRU list it is assigned to.
*
* @returns nothing.
* @param pEntry The entry to unlink.
*/
{
if (pPrev)
else
{
if (pNext)
}
if (pNext)
else
{
if (pPrev)
}
}
/**
* Adds a cache entry to the given LRU list unlinking it from the currently
* assigned list if needed.
*
* @returns nothing.
* @param pList List to the add entry to.
* @param pEntry Entry to add.
*/
{
/* Remove from old list if needed */
else
{
}
}
/**
* Destroys a LRU list freeing all entries.
*
* @returns nothing
* @param pList Pointer to the LRU list to destroy.
*
* @note The caller must own the critical section of the cache.
*/
{
{
AssertMsg(!(pEntry->fFlags & (PDMACFILECACHE_ENTRY_IO_IN_PROGRESS | PDMACFILECACHE_ENTRY_IS_DIRTY)),
}
}
/**
* Tries to remove the given amount of bytes from a given list in the cache
* moving the entries to one of the given ghosts lists
*
* @returns Amount of data which could be freed.
* @param pCache Pointer to the global cache data.
* @param cbData The amount of the data to free.
* @param pListSrc The source list to evict data from.
* @param pGhostListSrc The ghost list removed entries should be moved to
* NULL if the entry should be freed.
* @param fReuseBuffer Flag whether a buffer should be reused if it has the same size
* @param ppbBuf Where to store the address of the buffer if an entry with the
* same size was found and fReuseBuffer is true.
*
* @note This function may return fewer bytes than requested because entries
* may be marked as non evictable if they are used for I/O at the
* moment.
*/
{
#ifdef VBOX_WITH_2Q_CACHE
("Destination list must be NULL or the recently used but paged out list\n"));
#else
("Destination list must be NULL or one of the ghost lists\n"));
#endif
if (fReuseBuffer)
{
}
/* Start deleting from the tail. */
{
/* We can't evict pages which are currently in progress */
{
/* Ok eviction candidate. Grab the endpoint semaphore and check again
* because somebody else might have raced us. */
{
("This entry is deprecated so it should have the I/O in progress flag set\n"));
{
}
if (pGhostListDst)
{
#ifdef VBOX_WITH_2Q_CACHE
/* We have to remove the last entries from the paged out list. */
{
}
#endif
}
else
{
/* Delete the entry from the AVL tree it is assigned to. */
}
}
}
else
LogFlow(("Entry %#p (%u bytes) is still in progress and can't be evicted\n", pCurr, pCurr->cbData));
}
return cbEvicted;
}
#ifdef VBOX_WITH_2Q_CACHE
static bool pdmacFileCacheReclaim(PPDMACFILECACHEGLOBAL pCache, size_t cbData, bool fReuseBuffer, uint8_t **ppbBuffer)
{
return true;
{
/* Try to evict as many bytes as possible from A1in */
/*
* If it was not possible to remove enough entries
* try the frequently accessed cache.
*/
{
Assert(!fReuseBuffer || !*ppbBuffer); /* It is not possible that we got a buffer with the correct size but we didn't freed enough data. */
}
}
else
{
/* We have to remove entries from frequently access list. */
}
}
#else
static size_t pdmacFileCacheReplace(PPDMACFILECACHEGLOBAL pCache, size_t cbData, PPDMACFILELRULIST pEntryList,
{
{
/* We need to remove entry size pages from T1 and move the entries to B1 */
}
else
{
/* We need to remove entry size pages from T2 and move the entries to B2 */
}
}
/**
* Tries to evict the given amount of the data from the cache.
*
* @returns Bytes removed.
* @param pCache The global cache data.
* @param cbData Number of bytes to evict.
*/
static size_t pdmacFileCacheEvict(PPDMACFILECACHEGLOBAL pCache, size_t cbData, bool fReuseBuffer, uint8_t **ppbBuffer)
{
{
/* Delete desired pages from the cache. */
{
NULL,
}
else
{
NULL,
}
}
else
{
{
NULL,
}
}
return cbRemoved;
}
/**
* Updates the cache parameters
*
* @returns nothing.
* @param pCache The global cache data.
* @param pEntry The entry usign for the update.
*/
{
int32_t uUpdateVal = 0;
/* Update parameters */
{
uUpdateVal = 1;
else
}
{
uUpdateVal = 1;
else
}
else
AssertMsgFailed(("Invalid list type\n"));
}
#endif
/**
* Initiates a read I/O task for the given entry.
*
* @returns nothing.
* @param pEntry The entry to fetch the data to.
*/
{
/* Make sure no one evicts the entry while it is accessed. */
/* Send it off to the I/O manager. */
}
/**
* Initiates a write I/O task for the given entry.
*
* @returns nothing.
* @param pEntry The entry to read the data from.
*/
{
/* Make sure no one evicts the entry while it is accessed. */
/* Send it off to the I/O manager. */
}
/**
* Completes a task segment freeing all ressources and completes the task handle
* if everything was transfered.
*
* @returns Next task segment handle.
* @param pEndpointCache The endpoint cache.
* @param pTaskSeg Task segment to complete.
*/
static PPDMACFILETASKSEG pdmacFileCacheTaskComplete(PPDMACFILEENDPOINTCACHE pEndpointCache, PPDMACFILETASKSEG pTaskSeg)
{
return pNext;
}
/**
* Completion callback for I/O tasks.
*
* @returns nothing.
* @param pTask The completed task.
* @param pvUser Opaque user data.
*/
{
/* Reference the entry now as we are clearing the I/O in progres flag
* which protects the entry till now. */
/* Process waiting segment list. The data in entry might have changed inbetween. */
("The list tail was not updated correctly\n"));
{
AssertMsg(pEndpointCache->cWritesOutstanding > 0, ("Completed write request but outstanding task count is 0\n"));
{
}
else
{
while (pCurr)
{
}
}
}
else
{
while (pCurr)
{
{
}
else
}
}
/* Complete a pending flush if all writes have completed */
{
PPDMASYNCCOMPLETIONTASKFILE pTaskFlush = (PPDMASYNCCOMPLETIONTASKFILE)ASMAtomicXchgPtr((void * volatile *)&pEndpointCache->pTaskFlush, NULL);
if (pTaskFlush)
}
/* Dereference so that it isn't protected anymore except we issued anyother write for it. */
}
/**
* Initializies the I/O cache.
*
* returns VBox status code.
* @param pClassFile The global class data for file endpoints.
* @param pCfgNode CFGM node to query configuration data from.
*/
{
int rc = VINF_SUCCESS;
/* Initialize members */
#ifdef VBOX_WITH_2Q_CACHE
LogFlowFunc((": cbRecentlyUsedInMax=%u cbRecentlyUsedOutMax=%u\n", pCache->cbRecentlyUsedInMax, pCache->cbRecentlyUsedOutMax));
#else
#endif
"/PDM/AsyncCompletion/File/cbMax",
"Maximum cache size");
"Currently used cache");
#ifdef VBOX_WITH_2Q_CACHE
"Number of bytes cached in MRU list");
"Number of bytes cached in FRU list");
"Number of bytes cached in FRU ghost list");
#else
"Number of bytes cached in Mru list");
"Number of bytes cached in Fru list");
"Number of bytes cached in Mru ghost list");
STAMUNIT_BYTES, "Number of bytes cached in Fru ghost list");
#endif
#ifdef VBOX_WITH_STATISTICS
STAMUNIT_COUNT, "Number of hits in the cache");
STAMUNIT_COUNT, "Number of partial hits in the cache");
STAMUNIT_COUNT, "Number of misses when accessing the cache");
STAMUNIT_BYTES, "Number of bytes read from the cache");
STAMUNIT_BYTES, "Number of bytes written to the cache");
STAMUNIT_TICKS_PER_CALL, "Time taken to access an entry in the tree");
STAMUNIT_TICKS_PER_CALL, "Time taken to insert an entry in the tree");
STAMUNIT_TICKS_PER_CALL, "Time taken to remove an entry an the tree");
STAMUNIT_COUNT, "Number of times a buffer could be reused");
#ifndef VBOX_WITH_2Q_CACHE
"Adaption value of the cache");
#endif
#endif
/* Initialize the critical section */
if (RT_SUCCESS(rc))
return rc;
}
/**
* Destroysthe cache freeing all data.
*
* returns nothing.
* @param pClassFile The global class data for file endpoints.
*/
{
/* Make sure no one else uses the cache now */
#ifdef VBOX_WITH_2Q_CACHE
/* Cleanup deleting all cache entries waiting for in progress entries to finish. */
#else
/* Cleanup deleting all cache entries waiting for in progress entries to finish. */
#endif
}
/**
* Initializes per endpoint cache data
* like the AVL tree used to access cached entries.
*
* @returns VBox status code.
* @param pEndpoint The endpoint to init the cache for,
* @param pClassFile The global class data for file endpoints.
*/
int pdmacFileEpCacheInit(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMASYNCCOMPLETIONEPCLASSFILE pClassFile)
{
if (RT_SUCCESS(rc))
{
if (!pEndpointCache->pTree)
{
rc = VERR_NO_MEMORY;
}
}
#ifdef VBOX_WITH_STATISTICS
if (RT_SUCCESS(rc))
{
STAMUNIT_COUNT, "Number of deferred writes",
}
#endif
return rc;
}
/**
* Callback for the AVL destroy routine. Frees a cache entry for this endpoint.
*
* @returns IPRT status code.
* @param pNode The node to destroy.
* @param pvUser Opaque user data.
*/
{
{
RTThreadSleep(250);
}
AssertMsg(!(pEntry->fFlags & (PDMACFILECACHE_ENTRY_IO_IN_PROGRESS | PDMACFILECACHE_ENTRY_IS_DIRTY)),
return VINF_SUCCESS;
}
/**
* Destroys all cache ressources used by the given endpoint.
*
* @returns nothing.
* @param pEndpoint The endpoint to the destroy.
*/
{
/* Make sure nobody is accessing the cache while we delete the tree. */
#ifdef VBOX_WITH_STATISTICS
PPDMASYNCCOMPLETIONEPCLASSFILE pEpClassFile = (PPDMASYNCCOMPLETIONEPCLASSFILE)pEndpoint->Core.pEpClass;
#endif
}
static PPDMACFILECACHEENTRY pdmacFileEpCacheGetCacheEntryByOffset(PPDMACFILEENDPOINTCACHE pEndpointCache, RTFOFF off)
{
if (pEntry)
return pEntry;
}
static PPDMACFILECACHEENTRY pdmacFileEpCacheGetCacheBestFitEntryByOffset(PPDMACFILEENDPOINTCACHE pEndpointCache, RTFOFF off)
{
pEntry = (PPDMACFILECACHEENTRY)RTAvlrFileOffsetGetBestFit(pEndpointCache->pTree, off, true /*fAbove*/);
if (pEntry)
return pEntry;
}
static void pdmacFileEpCacheInsertEntry(PPDMACFILEENDPOINTCACHE pEndpointCache, PPDMACFILECACHEENTRY pEntry)
{
}
/**
* Allocates and initializes a new entry for the cache.
* The entry has a reference count of 1.
*
* @returns Pointer to the new cache entry or NULL if out of memory.
* @param pCache The cache the entry belongs to.
* @param pEndoint The endpoint the entry holds data for.
* @param off Start offset.
* @param cbData Size of the cache entry.
* @param pbBuffer Pointer to the buffer to use.
* NULL if a new buffer should be allocated.
* The buffer needs to have the same size of the entry.
*/
{
if (RT_UNLIKELY(!pEntryNew))
return NULL;
if (pbBuffer)
else
{
return NULL;
}
return pEntryNew;
}
/**
* Adds a segment to the waiting list for a cache entry
* which is currently in progress.
*
* @returns nothing.
* @param pEntry The cache entry to add the segment to.
* @param pSeg The segment to add.
*/
DECLINLINE(void) pdmacFileEpCacheEntryAddWaitingSegment(PPDMACFILECACHEENTRY pEntry, PPDMACFILETASKSEG pSeg)
{
if (pEntry->pWaitingHead)
{
}
else
{
}
}
/**
* in exclusive mode.
*
* @returns true if the flag in fSet is set and the one in fClear is clear.
* false othwerise.
* The R/W semaphore is only held if true is returned.
*
* @param pEndpointCache The endpoint cache instance data.
* @param pEntry The entry to check the flags for.
* @param fSet The flag which is tested to be set.
* @param fClear The flag which is tested to be clear.
*/
DECLINLINE(bool) pdmacFileEpCacheEntryFlagIsSetClearAcquireLock(PPDMACFILEENDPOINTCACHE pEndpointCache,
{
if (fPassed)
{
/* Acquire the lock and check again becuase the completion callback might have raced us. */
/* Drop the lock if we didn't passed the test. */
if (!fPassed)
}
return fPassed;
}
/**
* Advances the current segment buffer by the number of bytes transfered
* or gets the next segment.
*/
#define ADVANCE_SEGMENT_BUFFER(BytesTransfered) \
do \
{ \
cbSegLeft -= BytesTransfered; \
if (!cbSegLeft) \
{ \
iSegCurr++; \
} \
else \
pbSegBuf += BytesTransfered; \
} \
while (0)
/**
* Reads the specified data from the endpoint using the cache if possible.
*
* @returns VBox status code.
* @param pEndpoint The endpoint to read from.
* @param pTask The task structure used as identifier for this request.
* @param off The offset to start reading from.
* @param paSegments Pointer to the array holding the destination buffers.
* @param cSegments Number of segments in the array.
* @param cbRead Number of bytes to read.
*/
int pdmacFileEpCacheRead(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMASYNCCOMPLETIONTASKFILE pTask,
{
int rc = VINF_SUCCESS;
LogFlowFunc((": pEndpoint=%#p{%s} pTask=%#p off=%RTfoff paSegments=%#p cSegments=%u cbRead=%u\n",
/* Set to completed to make sure that the task is valid while we access it. */
int iSegCurr = 0;
while (cbRead)
{
/*
* If there is no entry we try to create a new one eviciting unused pages
* if the cache is full. If this is not possible we will pass the request through
* and skip the caching (all entries may be still in progress so they can't
* be evicted)
* If we have an entry it can be in one of the LRU lists where the entry
* contains data (recently used or frequently used LRU) so we can just read
* the data we need and put the entry at the head of the frequently used LRU list.
* In case the entry is in one of the ghost lists it doesn't contain any data.
* We have to fetch it again evicting pages from either T1 or T2 to make room.
*/
if (pEntry)
{
("Overflow in calculation off=%RTfoff OffsetAligned=%RTfoff\n",
if (!cbRead)
else
/* Ghost lists contain no data. */
#ifdef VBOX_WITH_2Q_CACHE
{
#else
{
#endif
0))
{
/* Entry is deprecated. Read data from the new buffer. */
while (cbToRead)
{
}
}
else
{
{
/* Entry didn't completed yet. Append to the list */
while (cbToRead)
{
}
}
else
{
/* Read as much as we can from the entry. */
while (cbToRead)
{
}
}
}
/* Move this entry to the top position */
#ifdef VBOX_WITH_2Q_CACHE
{
}
#else
#endif
}
else
{
#ifdef VBOX_WITH_2Q_CACHE
pdmacFileCacheEntryRemoveFromList(pEntry); /* Remove it before we remove data, otherwise it may get freed when evicting data. */
/* Move the entry to Am and fetch it to the cache. */
#else
/* Move the entry to T2 and fetch it to the cache. */
#endif
if (pbBuffer)
else
while (cbToRead)
{
("Overflow in calculation off=%RTfoff OffsetAligned=%RTfoff\n",
}
}
}
else
{
/* No entry found for this offset. Get best fit entry and fetch the data to the cache. */
PPDMACFILECACHEENTRY pEntryBestFit = pdmacFileEpCacheGetCacheBestFitEntryByOffset(pEndpointCache, off);
LogFlow(("%sbest fit entry for off=%RTfoff (BestFit=%RTfoff BestFitEnd=%RTfoff BestFitSize=%u)\n",
off,
if ( pEntryBestFit
{
}
else
{
/*
* Align the size to a 4KB boundary.
* Memory size is aligned to a page boundary
* and memory is wasted if the size is rahter small.
* (For example reads with a size of 512 bytes.
*/
/* Clip read to file size */
if (pEntryBestFit)
{
}
}
if (!cbRead)
else
#ifdef VBOX_WITH_2Q_CACHE
if (fEnough)
{
#else
if (cbRemoved >= cbToReadAligned)
{
LogFlow(("Evicted %u bytes (%u requested). Creating new cache entry\n", cbRemoved, cbToReadAligned));
#endif
PPDMACFILECACHEENTRY pEntryNew = pdmacFileCacheEntryAlloc(pCache, pEndpoint, off, cbToReadAligned, pbBuffer);
#ifdef VBOX_WITH_2Q_CACHE
#else
#endif
uint32_t uBufOffset = 0;
while (cbToRead)
{
}
}
else
{
/*
* There is not enough free space in the cache.
* Pass the request directly to the I/O manager.
*/
LogFlow(("Couldn't evict %u bytes from the cache. Remaining request will be passed through\n", cbToRead));
while (cbToRead)
{
/* Send it off to the I/O manager. */
}
}
}
}
else
return rc;
}
/**
* Writes the given data to the endpoint using the cache if possible.
*
* @returns VBox status code.
* @param pEndpoint The endpoint to write to.
* @param pTask The task structure used as identifier for this request.
* @param off The offset to start writing to
* @param paSegments Pointer to the array holding the source buffers.
* @param cSegments Number of segments in the array.
* @param cbWrite Number of bytes to write.
*/
int pdmacFileEpCacheWrite(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMASYNCCOMPLETIONTASKFILE pTask,
{
int rc = VINF_SUCCESS;
LogFlowFunc((": pEndpoint=%#p{%s} pTask=%#p off=%RTfoff paSegments=%#p cSegments=%u cbWrite=%u\n",
/* Set to completed to make sure that the task is valid while we access it. */
int iSegCurr = 0;
while (cbWrite)
{
if (pEntry)
{
/* Write the data into the entry and mark it as dirty */
("Overflow in calculation off=%RTfoff OffsetAligned=%RTfoff\n",
if (!cbWrite)
else
/* Ghost lists contain no data. */
#ifdef VBOX_WITH_2Q_CACHE
#else
#endif
{
/* Check if the buffer is deprecated. */
0))
{
("Entry is deprecated but not in progress\n"));
/* Update the data from the write. */
while (cbToWrite)
{
}
}
else /* Deprecated flag not set */
{
/* If the entry is dirty it must be also in progress now and we have to defer updating it again. */
0))
{
("Entry is dirty but not in progress\n"));
/* Deprecate the current buffer. */
if (!pEntry->pWaitingHead)
/* If we are out of memory or have waiting segments
* defer the write. */
{
/* The data isn't written to the file yet */
while (cbToWrite)
{
}
}
else /* Deprecate buffer */
{
#if 1
/* Copy the data before the update. */
if (OffDiff)
/* Copy data behind the update. */
#else
/* A safer method but probably slower. */
#endif
/* Update the data from the write. */
while (cbToWrite)
{
}
/* We are done here. A new write is initiated if the current request completes. */
}
}
else /* Dirty bit not set */
{
/*
* Check if a read is in progress for this entry.
* We have to defer processing in that case.
*/
0))
{
while (cbToWrite)
{
}
}
else /* I/O in progres flag not set */
{
/* Write as much as we can into the entry and update the file. */
while (cbToWrite)
{
}
}
} /* Dirty bit not set */
/* Move this entry to the top position */
#ifdef VBOX_WITH_2Q_CACHE
{
} /* Deprecated flag not set. */
#else
#endif
}
}
else /* Entry is on the ghost list */
{
#ifdef VBOX_WITH_2Q_CACHE
pdmacFileCacheEntryRemoveFromList(pEntry); /* Remove it before we remove data, otherwise it may get freed when evicting data. */
/* Move the entry to Am and fetch it to the cache. */
#else
/* Move the entry to T2 and fetch it to the cache. */
#endif
if (pbBuffer)
else
while (cbToWrite)
{
("Overflow in calculation off=%RTfoff OffsetAligned=%RTfoff\n",
}
}
/* Release the reference. If it is still needed the I/O in progress flag should protect it now. */
}
else /* No entry found */
{
/*
* No entry found. Try to create a new cache entry to store the data in and if that fails
* write directly to the file.
*/
PPDMACFILECACHEENTRY pEntryBestFit = pdmacFileEpCacheGetCacheBestFitEntryByOffset(pEndpointCache, off);
LogFlow(("%sest fit entry for off=%RTfoff (BestFit=%RTfoff BestFitEnd=%RTfoff BestFitSize=%u)\n",
off,
{
}
else
{
if (pEntryBestFit)
}
#ifdef VBOX_WITH_2Q_CACHE
if (fEnough)
{
#else
{
#endif
#ifdef VBOX_WITH_2Q_CACHE
#else
#endif
while (cbToWrite)
{
}
}
else
{
/*
* There is not enough free space in the cache.
* Pass the request directly to the I/O manager.
*/
LogFlow(("Couldn't evict %u bytes from the cache. Remaining request will be passed through\n", cbToWrite));
while (cbToWrite)
{
/* Send it off to the I/O manager. */
}
}
}
}
{
/* Complete a pending flush if all writes have completed */
{
PPDMASYNCCOMPLETIONTASKFILE pTaskFlush = (PPDMASYNCCOMPLETIONTASKFILE)ASMAtomicXchgPtr((void * volatile *)&pEndpointCache->pTaskFlush, NULL);
if (pTaskFlush)
}
}
else
return rc;
}
int pdmacFileEpCacheFlush(PPDMASYNCCOMPLETIONENDPOINTFILE pEndpoint, PPDMASYNCCOMPLETIONTASKFILE pTask)
{
int rc = VINF_SUCCESS;
LogFlowFunc((": pEndpoint=%#p{%s} pTask=%#p\n",
else
{
{
}
else
}
return rc;
}