FTM.cpp revision c7814cf6e1240a519cbec0441e033d0e2470ed00
/* $Id$ */
/** @file
* FTM - Fault Tolerance Manager
*/
/*
* Copyright (C) 2010-2013 Oracle Corporation
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*/
/*******************************************************************************
* Header Files *
*******************************************************************************/
#define LOG_GROUP LOG_GROUP_FTM
#include "FTMInternal.h"
#include <iprt/semaphore.h>
/*******************************************************************************
* Structures and Typedefs *
*******************************************************************************/
/**
* TCP stream header.
*
* This is an extra layer for fixing the problem with figuring out when the SSM
* stream ends.
*/
typedef struct FTMTCPHDR
{
/* NOTE(review): only the member comments are visible here, not the member
 * declarations they describe - confirm the fields against the full file. */
/** Magic value. */
/** The size of the data block following this header.
* 0 indicates the end of the stream, while UINT32_MAX indicates
* cancellation. */
} FTMTCPHDR;
/** Magic value for FTMTCPHDR::u32Magic. (Egberto Gismonti Amin) */
/** The max block size. */
/**
* TCP stream header used when syncing memory pages.
*
* This is an extra layer for fixing the problem with figuring out when the SSM
* stream ends.
*/
typedef struct FTMTCPHDRMEM
{
/* NOTE(review): only the member comments are visible here, not the member
 * declarations they describe - confirm the fields against the full file. */
/** Magic value. */
/** Size (Uncompressed) of the pages following the header. */
/** GC Physical address of the page(s) to sync. */
/* NOTE(review): the comment below is word-for-word the one used in FTMTCPHDR;
 * verify that it really describes a member of this structure. */
/** The size of the data block following this header.
* 0 indicates the end of the stream, while UINT32_MAX indicates
* cancellation. */
} FTMTCPHDRMEM;
/*******************************************************************************
* Global Variables *
*******************************************************************************/
/** Newline-terminated greeting string naming the fault tolerance sync protocol
 * and its version (1.0).  Presumably this is the "welcome message" that other
 * comments in this file describe as being sent to the master node and read
 * and checked there - TODO confirm against the users of this variable. */
static const char g_szWelcome[] = "VirtualBox-Fault-Tolerance-Sync-1.0\n";
/**
* Initializes the FTM.
*
* @returns VBox status code.
* @param pVM Pointer to the VM.
*/
{
/*
* Assert alignment and sizes.
*/
/** @todo saved state for master nodes! */
pVM->fFaultTolerantMaster = false;
/*
* Initialize the FTM critical section.
*/
/*
* Register statistics.
*/
STAM_REL_REG(pVM, &pVM->ftm.s.StatReceivedMem, STAMTYPE_COUNTER, "/FT/Received/Mem", STAMUNIT_BYTES, "The amount of memory pages that was received.");
STAM_REL_REG(pVM, &pVM->ftm.s.StatReceivedState, STAMTYPE_COUNTER, "/FT/Received/State", STAMUNIT_BYTES, "The amount of state information that was received.");
STAM_REL_REG(pVM, &pVM->ftm.s.StatSentMem, STAMTYPE_COUNTER, "/FT/Sent/Mem", STAMUNIT_BYTES, "The amount of memory pages that was sent.");
STAM_REL_REG(pVM, &pVM->ftm.s.StatSentState, STAMTYPE_COUNTER, "/FT/Sent/State", STAMUNIT_BYTES, "The amount of state information that was sent.");
STAM_REL_REG(pVM, &pVM->ftm.s.StatDeltaVM, STAMTYPE_COUNTER, "/FT/Sync/DeltaVM", STAMUNIT_OCCURENCES, "Number of delta vm syncs.");
STAM_REL_REG(pVM, &pVM->ftm.s.StatFullSync, STAMTYPE_COUNTER, "/FT/Sync/Full", STAMUNIT_OCCURENCES, "Number of full vm syncs.");
STAM_REL_REG(pVM, &pVM->ftm.s.StatDeltaMem, STAMTYPE_COUNTER, "/FT/Sync/DeltaMem", STAMUNIT_OCCURENCES, "Number of delta mem syncs.");
STAM_REL_REG(pVM, &pVM->ftm.s.StatCheckpointStorage, STAMTYPE_COUNTER, "/FT/Checkpoint/Storage", STAMUNIT_OCCURENCES, "Number of storage checkpoints.");
STAM_REL_REG(pVM, &pVM->ftm.s.StatCheckpointNetwork, STAMTYPE_COUNTER, "/FT/Checkpoint/Network", STAMUNIT_OCCURENCES, "Number of network checkpoints.");
#ifdef VBOX_WITH_STATISTICS
STAM_REG(pVM, &pVM->ftm.s.StatCheckpoint, STAMTYPE_PROFILE, "/FT/Checkpoint", STAMUNIT_TICKS_PER_CALL, "Profiling of FTMR3SetCheckpoint.");
STAM_REG(pVM, &pVM->ftm.s.StatCheckpointPause, STAMTYPE_PROFILE, "/FT/Checkpoint/Pause", STAMUNIT_TICKS_PER_CALL, "Profiling of FTMR3SetCheckpoint.");
STAM_REG(pVM, &pVM->ftm.s.StatCheckpointResume, STAMTYPE_PROFILE, "/FT/Checkpoint/Resume", STAMUNIT_TICKS_PER_CALL, "Profiling of FTMR3SetCheckpoint.");
STAM_REG(pVM, &pVM->ftm.s.StatSentMemRAM, STAMTYPE_COUNTER, "/FT/Sent/Mem/RAM", STAMUNIT_BYTES, "The amount of memory pages that was sent.");
STAM_REG(pVM, &pVM->ftm.s.StatSentMemMMIO2, STAMTYPE_COUNTER, "/FT/Sent/Mem/MMIO2", STAMUNIT_BYTES, "The amount of memory pages that was sent.");
STAM_REG(pVM, &pVM->ftm.s.StatSentMemShwROM, STAMTYPE_COUNTER, "/FT/Sent/Mem/ShwROM", STAMUNIT_BYTES, "The amount of memory pages that was sent.");
STAM_REG(pVM, &pVM->ftm.s.StatSentStateWrite, STAMTYPE_COUNTER, "/FT/Sent/State/Writes", STAMUNIT_BYTES, "The nr of write calls.");
#endif
return VINF_SUCCESS;
}
/**
* Terminates the FTM.
*
* Termination means cleaning up and freeing all resources,
* the VM itself is at this point powered off or suspended.
*
* @returns VBox status code.
* @param pVM Pointer to the VM.
*/
{
{
}
{
}
{
}
/* Remove all pending memory updates. */
{
}
return VINF_SUCCESS;
}
{
if (RT_FAILURE(rc))
{
}
return rc;
}
{
char szMsg[256];
if (pszMsgText && *pszMsgText)
{
}
else
if (RT_FAILURE(rc))
return rc;
}
/**
* Reads a string from the socket.
*
* @returns VBox status code.
*
* @param pState The teleporter state structure.
* @param pszBuf The output buffer.
* @param cchBuf The size of the output buffer.
*
*/
{
*pszBuf = '\0';
/* dead simple approach. */
for (;;)
{
char ch;
if (RT_FAILURE(rc))
{
return rc;
}
if ( ch == '\n'
|| ch == '\0')
return VINF_SUCCESS;
if (cchBuf <= 1)
{
return VERR_BUFFER_OVERFLOW;
}
*pszBuf = '\0';
cchBuf--;
}
}
/**
* Reads an ACK or NACK.
*
* @returns VBox status code.
* @param pVM Pointer to the VM.
* @param pszWhich Which ACK this is.
* @param pszNAckMsg Optional NACK message.
*/
{
char szMsg[256];
if (RT_FAILURE(rc))
return rc;
return VINF_SUCCESS;
{
if (pszMsgText)
*pszMsgText++ = '\0';
if (rc == VINF_SUCCESS)
{
/*
* Well formed NACK, transform it into an error.
*/
if (pszNAckMsg)
{
return VERR_INTERNAL_ERROR;
}
if (pszMsgText)
{
}
return VERR_INTERNAL_ERROR_2;
}
if (pszMsgText)
}
return VERR_INTERNAL_ERROR_3;
}
/**
* Submits a command to the destination and waits for the ACK.
*
* @returns VBox status code.
*
* @param pVM Pointer to the VM.
* @param pszCommand The command.
* @param fWaitForAck Whether to wait for the ACK.
*/
{
int rc = RTTcpSgWriteL(pVM->ftm.s.hSocket, 2, pszCommand, strlen(pszCommand), "\n", sizeof("\n") - 1);
if (RT_FAILURE(rc))
return rc;
if (!fWaitForAck)
return VINF_SUCCESS;
}
/**
* @copydoc SSMSTRMOPS::pfnWrite
*/
static DECLCALLBACK(int) ftmR3TcpOpWrite(void *pvUser, uint64_t offStream, const void *pvBuf, size_t cbToWrite)
{
/* NOTE(review): `rc` is tested below without a visible declaration or
 * assignment, and none of the parameters are referenced in the lines shown;
 * the statements that perform the actual write appear to be missing from
 * this view - confirm against the full file. */
for (;;)
{
/* A failure status is handed back to the caller unchanged. */
if (RT_FAILURE(rc))
{
return rc;
}
/* As shown, the first iteration always returns here, so the loop never
 * repeats and the "advance" step below is not reached. */
return VINF_SUCCESS;
/* advance */
}
}
/**
* Selects and poll for close condition.
*
* We can use a relatively high poll timeout here since it's only used to get
* us out of error paths. In the normal course of events, we'll get an
* end-of-stream header.
*
* @returns VBox status code.
*
* @param pState The teleporter state data.
*/
{
int rc;
do
{
{
break;
}
{
break;
}
} while (rc == VERR_TIMEOUT);
return rc;
}
/**
* @copydoc SSMSTRMOPS::pfnRead
*/
static DECLCALLBACK(int) ftmR3TcpOpRead(void *pvUser, uint64_t offStream, void *pvBuf, size_t cbToRead, size_t *pcbRead)
{
/* NOTE(review): the body below is fragmentary in this view: the conditions
 * guarding the early returns, the assignments to `rc` and the socket reads
 * are not visible, and there is an unbalanced ')' further down.  The added
 * comments describe only what the visible lines show. */
for (;;)
{
int rc;
/*
* Check for various conditions that may have been signalled.
*/
/* As shown these three returns are unconditional, so only the first one
 * can execute; presumably each was guarded by a condition - confirm. */
return VERR_EOF;
return VERR_EOF;
return VERR_IO_GEN_FAILURE;
/*
* If there is no more data in the current block, read the next
* block header.
*/
{
/* `rc` is tested twice in a row without a visible assignment in between. */
if (RT_FAILURE(rc))
return rc;
if (RT_FAILURE(rc))
{
return rc;
}
{
)
{
}
/* Generic I/O failure status for this path. */
return VERR_IO_GEN_FAILURE;
}
/* End-of-stream status for this path. */
return VERR_EOF;
}
/*
* Read more data.
*/
if (RT_FAILURE(rc))
return rc;
if (RT_FAILURE(rc))
{
return rc;
}
/* pcbRead is the only parameter referenced in the visible lines; both the
 * non-NULL and the NULL case end in VINF_SUCCESS as shown. */
if (pcbRead)
{
return VINF_SUCCESS;
}
return VINF_SUCCESS;
/* Advance to the next block. */
}
}
/**
* @copydoc SSMSTRMOPS::pfnSeek
*/
static DECLCALLBACK(int) ftmR3TcpOpSeek(void *pvUser, int64_t offSeek, unsigned uMethod, uint64_t *poffActual)
{
    /* Seeking is not offered by this stream implementation: every call is
     * answered with the same status regardless of the arguments. */
    int const rcUnsupported = VERR_NOT_SUPPORTED;

    /* None of the arguments are consulted; *poffActual is left untouched. */
    (void)pvUser;
    (void)offSeek;
    (void)uMethod;
    (void)poffActual;

    return rcUnsupported;
}
/**
* @copydoc SSMSTRMOPS::pfnTell
*/
{
}
/**
* @copydoc SSMSTRMOPS::pfnSize
*/
{
return VERR_NOT_SUPPORTED;
}
/**
* @copydoc SSMSTRMOPS::pfnIsOk
*/
{
if (pVM->fFaultTolerantMaster)
{
/* Poll for incoming NACKs and errors from the other side */
if (rc != VERR_TIMEOUT)
{
if (RT_SUCCESS(rc))
{
}
else
return rc;
}
}
return VINF_SUCCESS;
}
/**
* @copydoc SSMSTRMOPS::pfnClose
*/
{
if (pVM->fFaultTolerantMaster)
{
if (RT_FAILURE(rc))
{
return rc;
}
}
else
{
}
return VINF_SUCCESS;
}
/**
* Method table for a TCP based stream.
*/
static SSMSTRMOPS const g_ftmR3TcpOps =
{
/* NOTE(review): no initializers are visible in this view.  Going by the
 * @copydoc tags in this file, the table presumably carries the pfnWrite,
 * pfnRead, pfnSeek, pfnTell, pfnSize, pfnIsOk and pfnClose callbacks -
 * confirm the members and their order against the SSMSTRMOPS declaration. */
};
/**
* VMR3ReqCallWait callback
*
* @param pVM Pointer to the VM.
*
*/
{
}
/**
* Sync the VM state
*
* @returns VBox status code.
* @param pVM Pointer to the VM.
*/
{
bool fSuspended = false;
/* Reset the sync state. */
/* Write protect all memory. */
return rc;
}
/**
* PGMR3PhysEnumDirtyFTPages callback for syncing dirty physical pages
*
* @param pVM Pointer to the VM.
* @param GCPhys GC physical address
* @param pRange HC virtual address of the page(s)
* @param cbRange Size of the dirty range in bytes.
* @param pvUser User argument
*/
static DECLCALLBACK(int) ftmR3SyncDirtyPage(PVM pVM, RTGCPHYS GCPhys, uint8_t *pRange, unsigned cbRange, void *pvUser)
{
/* NOTE(review): the statements that build and send the data are not visible
 * in this view; `rc` is tested without a visible declaration or assignment,
 * and no return statement is visible on the success path even though the
 * callback returns int - confirm against the full file. */
/** @todo compress page(s). */
/* A failure status is handed back to the caller unchanged. */
if (RT_FAILURE(rc))
{
return rc;
}
#ifdef VBOX_WITH_STATISTICS
/* Statistics-only section.  NOTE(review): the case labels below sit in a bare
 * block with no visible switch statement or per-case statements; presumably
 * each page type bumps its own "sent" counter - confirm. */
{
case PGMPAGETYPE_RAM:
break;
case PGMPAGETYPE_MMIO2:
break;
case PGMPAGETYPE_ROM_SHADOW:
break;
/* As shown, this assertion follows a break with no case label of its own and
 * so cannot be reached; a label may be missing from this view. */
AssertFailed();
break;
default:
/* Any other page type trips an assertion. */
AssertFailed();
break;
}
#endif
}
/**
* Thread function which starts syncing process for this master VM
*
* @param hThread The thread handle.
* @param pvUser Pointer to the VM.
* @return VINF_SUCCESS (ignored).
*
*/
{
int rc = VINF_SUCCESS;
for (;;)
{
/*
* Try connect to the standby machine.
*/
if (RT_SUCCESS(rc))
{
Log(("ftmR3MasterThread: CONNECTED\n"));
/* Disable Nagle. */
/* Read and check the welcome message. */
if ( RT_SUCCESS(rc)
{
/* password */
if (RT_SUCCESS(rc))
{
/* ACK */
if (RT_SUCCESS(rc))
{
/** @todo verify VM config. */
break;
}
}
}
/* Failed, so don't bother anymore. */
return VINF_SUCCESS;
}
if (rc != VERR_TIMEOUT)
return VINF_SUCCESS; /* told to quit */
}
/* Successfully initialized the connection to the standby node.
* Start the sync process.
*/
/* First sync all memory and write protect everything so
* we can send changed pages later on.
*/
for (;;)
{
if (rc != VERR_TIMEOUT)
break; /* told to quit */
{
/* sync the changed memory with the standby node. */
/* Write protect all memory. */
{
}
/* Enumerate all dirty pages and send them to the standby VM. */
{
}
/* Send last memory header to signal the end. */
Hdr.cbPageRange = 0;
if (RT_FAILURE(rc))
}
}
return rc;
}
/**
* Syncs memory from the master VM
*
* @returns VBox status code.
* @param pVM Pointer to the VM.
*/
{
while (true)
{
/* Read memory header. */
if (RT_FAILURE(rc))
{
break;
}
break; /* end of sync. */
/* Must be a multiple of PAGE_SIZE. */
while (Hdr.cbPageRange)
{
PFTMPHYSPAGETREENODE pNode = (PFTMPHYSPAGETREENODE)RTAvlGCPhysGet(&pVM->ftm.s.standby.pPhysPageTree, GCPhys);
if (!pNode)
{
/* Allocate memory for the node and page. */
/* Insert the node into the tree. */
}
/* Fetch the page. */
if (RT_FAILURE(rc))
{
break;
}
}
}
return VINF_SUCCESS;
}
/**
* Callback handler for RTAvlGCPhysDestroy
*
* @returns 0 to continue, otherwise stop
* @param pBaseNode Node to destroy
* @param pvUser Pointer to the VM.
*/
{
if (pVM) /* NULL when the VM is destroyed. */
{
/* Update the guest memory of the standby VM. */
}
return 0;
}
/**
* Thread function which monitors the health of the master VM
*
* @param hThread The thread handle.
* @param pvUser Pointer to the VM.
* @return VINF_SUCCESS (ignored).
*
*/
{
for (;;)
{
if (rc != VERR_TIMEOUT)
break; /* told to quit */
{
u64TimeNow = RTTimeMilliTS();
{
/* Timeout; prepare to fail over. */
LogRel(("FTSync: TIMEOUT (%RX64 vs %RX64 ms): activate standby VM!\n", u64TimeNow, pVM->ftm.s.standby.u64LastHeartbeat + pVM->ftm.s.uInterval * 2));
/** @todo prevent split-brain. */
break;
}
}
}
return VINF_SUCCESS;
}
/**
* Listen for incoming traffic destined for the standby VM.
*
* @copydoc FNRTTCPSERVE
*
* @returns VINF_SUCCESS or VERR_TCP_SERVER_STOP.
*/
{
/*
* Disable Nagle.
*/
/* Send the welcome message to the master node. */
if (RT_FAILURE(rc))
{
return VINF_SUCCESS;
}
/*
* Password.
*/
if (pszPassword)
{
unsigned off = 0;
while (pszPassword[off])
{
char ch;
if ( RT_FAILURE(rc)
{
if (RT_FAILURE(rc))
else
return VINF_SUCCESS;
}
off++;
}
}
if (RT_FAILURE(rc))
return VINF_SUCCESS;
/** @todo verify VM config. */
/*
* Stop the server.
*
* Note! After this point we must return VERR_TCP_SERVER_STOP, while prior
* to it we must not return that value!
*/
/*
* Command processing loop.
*/
//bool fDone = false;
for (;;)
{
bool fFullSync = false;
char szCmd[128];
if (RT_FAILURE(rc))
break;
{
if (RT_FAILURE(rc))
continue;
}
else
|| (fFullSync = true)) /* intended assignment */
{
if (RT_FAILURE(rc))
continue;
/* Flush all pending memory updates. */
{
}
/* Reset the sync state. */
if (RT_FAILURE(rc))
{
continue;
}
/* The EOS might not have been read, make sure it is. */
{
continue;
}
}
}
return VERR_TCP_SERVER_STOP;
}
/**
* Powers on the fault tolerant virtual machine.
*
* @returns VBox status code.
*
* @param pUVM The user mode VM handle.
* @param fMaster FT master or standby
* @param uInterval FT sync interval
* @param pszAddress Standby VM address
* @param uPort Standby VM port
* @param pszPassword FT password (NULL for none)
*
* @thread Any thread.
* @vmstate Created
* @vmstateto PoweringOn+Running (master), PoweringOn+Running_FT (standby)
*/
{
else
if (pszPassword)
if (RT_FAILURE(rc))
return rc;
if (fMaster)
{
if (RT_FAILURE(rc))
return rc;
pVM->fFaultTolerantMaster = true;
if (PGMIsUsingLargePages(pVM))
{
/* Must disable large page usage as 2 MB pages are too big to write monitor. */
LogRel(("FTSync: disabling large page usage.\n"));
PGMSetLargePageUsage(pVM, false);
}
/** @todo might need to disable page fusion as well */
}
/* standby */
0, RTTHREADTYPE_DEFAULT, 0, "ftmStandby");
if (RT_FAILURE(rc))
return rc;
if (RT_FAILURE(rc))
return rc;
/** @todo deal with the exit code to check if we should activate this standby VM. */
{
/** @todo failover. */
}
{
}
if (rc == VERR_TCP_SERVER_SHUTDOWN)
return rc;
}
/**
* Powers off the fault tolerant virtual machine (standby).
*
* @returns VBox status code.
*
* @param pUVM The user mode VM handle.
*/
{
}
/**
* Rendezvous callback used by FTMR3SetCheckpoint
* Sync state + changed memory with the standby node.
*
* This is only called on one of the EMTs while the other ones are waiting for
* it to complete this function.
*
* @returns VINF_SUCCESS (VBox strict status code).
* @param pVM Pointer to the VM.
* @param pVCpu The VMCPU for the EMT we're being called on. Unused.
* @param pvUser Not used.
*/
{
int rc = VINF_SUCCESS;
bool fSuspended = false;
/* We don't call VMR3Suspend here to avoid the overhead of state changes and notifications. This
* is only a short suspend.
*/
/* Hack alert: as EM is responsible for dealing with the suspend state, we must do this here ourselves, but only for this EMT. */
/* Reset the sync state. */
/* Write protect all memory. */
/* We don't call VMR3Resume here to avoid the overhead of state changes and notifications. This
* is only a short suspend.
*/
/* Hack alert: as EM is responsible for dealing with the suspend state, we must do this here ourselves, but only for this EMT. */
return rc;
}
/**
* Performs a full sync to the standby node
*
* @returns VBox status code.
*
* @param pVM Pointer to the VM.
* @param enmCheckpoint Checkpoint type
*/
{
int rc;
if (!pVM->fFaultTolerantMaster)
return VINF_SUCCESS;
switch (enmCheckpoint)
{
break;
break;
default:
}
{
/* We must take special care here as the memory sync is competing with us and requires a responsive EMT. */
{
{
}
{
}
}
}
else
return rc;
}