utf-16.cpp revision 677833bc953b6cb418c701facbdcf4aa18d6c44e
/* $Id$ */
/** @file
* InnoTek Portable Runtime - UTF-16
*/
/*
* Copyright (C) 2006 InnoTek Systemberatung GmbH
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License as published by the Free Software Foundation,
* in version 2 as it comes in the "COPYING" file of the VirtualBox OSE
* distribution. VirtualBox OSE is distributed in the hope that it will
* be useful, but WITHOUT ANY WARRANTY of any kind.
*
* If you received this file as part of a commercial VirtualBox
* distribution, then only the terms of your commercial VirtualBox
* license agreement apply instead of the previous paragraph.
*/
/*******************************************************************************
* Header Files *
*******************************************************************************/
/* NOTE(review): the function signature and the statement guarded by the check
* below are not visible in this view; presumably this is the routine that
* releases a UTF-16 string - confirm against the complete file. */
{
/* Only act when a string pointer was actually supplied. */
if (pwszString)
}
/* NOTE(review): the signature and the code that produces pwsz are not visible
* in this view; presumably this duplicates a UTF-16 string - confirm against
* the complete file. */
{
/* Hand the pointer back to the caller when it is non-NULL.  The NULL path is
* not visible here. */
if (pwsz)
return pwsz;
}
/* NOTE(review): the signature and the code that produces pwsz are not visible
* in this view; presumably pwsz is a freshly allocated copy - confirm against
* the complete file. */
{
if (pwsz)
{
/* Success: publish the pointer through the output parameter. */
*ppwszString = pwsz;
return VINF_SUCCESS;
}
/* pwsz was NULL, which this code reports as an out-of-memory condition. */
return VERR_NO_MEMORY;
}
/* NOTE(review): the signature and the declaration of pwsz are not visible in
* this view; presumably pwsz starts out equal to pwszString - confirm. */
{
/* A NULL string is treated as having length zero. */
if (!pwszString)
return 0;
/* Advance until the terminating zero element is found. */
while (*pwsz)
pwsz++;
/* The pointer difference is the element count between pwszString and the
* terminator that pwsz now points at. */
return pwsz - pwszString;
}
/* NOTE(review): the signature, the condition guarding the first return, and
* the computation of iDiff are not visible in this view; presumably this is
* the plain string comparison - confirm against the complete file. */
{
/* NOTE(review): the condition for this early "equal" result is missing here;
* presumably it tests whether both pointers are the same - confirm. */
return 0;
/* A NULL first string yields -1. */
if (!pwsz1)
return -1;
/* A NULL second string (with a non-NULL first string) yields 1. */
if (!pwsz2)
return 1;
for (;;)
{
/* NOTE(review): the lines that load the current elements, compute iDiff and
* decide when to return are missing from this view. */
return iDiff;
/* Step both strings forward in lock-step. */
pwsz1++;
pwsz2++;
}
}
/* NOTE(review): the signature, the declarations of wc1/wc2/iDiff/pwsz1Start
* and most of the statements that compute iDiff are not visible in this view;
* presumably this is the case-insensitive comparison - confirm against the
* complete file. */
{
/* NOTE(review): the condition for this early "equal" result is missing here;
* presumably it tests whether both pointers are the same - confirm. */
return 0;
/* A NULL first string yields -1. */
if (!pwsz1)
return -1;
/* A NULL second string (with a non-NULL first string) yields 1. */
if (!pwsz2)
return 1;
for (;;)
{
/* Only when the current elements differ is the more expensive handling
* below entered. */
if (iDiff)
{
/* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
if ( wc1 < 0xd800
|| wc2 < 0xd800
|| wc1 > 0xdfff
|| wc2 > 0xdfff)
{
/* simple UCS-2 char: at least one element lies outside 0xd800..0xdfff.
* NOTE(review): the statement guarded by the check below is missing. */
if (iDiff)
}
else
{
/* Both elements lie within 0xd800..0xdfff, i.e. both are surrogates. */
if (wc1 >= 0xdc00)
{
/* wc1 is in the 0xdc00..0xdfff half of the range.  When pwsz1 is still at
* the start of the string the difference is returned as it stands.
* NOTE(review): the code between the two returns below is missing. */
if (pwsz1Start == pwsz1)
return iDiff;
return iDiff;
}
else
{
/* wc1 is in the 0xd800..0xdbff half of the range.
* NOTE(review): the code leading up to this return is missing. */
return iDiff;
}
/* NOTE(review): the statement guarded by this check is missing. */
if (iDiff)
}
/* Still different after the handling above: report the difference. */
if (iDiff)
return iDiff;
}
/* The elements matched; a zero element means the strings ended equal. */
if (!wc1)
return 0;
/* Step both strings forward in lock-step. */
pwsz1++;
pwsz2++;
}
}
/* NOTE(review): the signature, the declarations of pwc/wc/uc/ucFolded and the
* folding statements are not visible in this view; presumably this folds the
* string to lower case in place - confirm against the complete file. */
{
for (;;)
{
/* Stop at the terminating zero element. */
if (!wc)
break;
/* NOTE(review): the condition opening this branch is missing; it appears to
* be the non-surrogate case, given the "surrogate" else branch below. */
{
/* NOTE(review): the statement guarded by this check is missing; the check
* itself tests that the folded value is below 0x10000. */
if (ucFolded < 0x10000)
}
else
{
/* surrogate */
{
{
/* Subtract the 0x10000 offset from the code point.
* NOTE(review): the surrounding statements are missing. */
uc -= 0x10000;
}
}
else /* invalid encoding. */
/* Step over the offending element. */
pwc++;
}
}
/* The pwsz pointer is returned to the caller. */
return pwsz;
}
/* NOTE(review): the signature, the declarations of pwc/wc/uc and the folding
* statements are not visible in this view; presumably this folds the string
* to upper case in place - confirm against the complete file. */
{
for (;;)
{
/* Stop at the terminating zero element. */
if (!wc)
break;
/* NOTE(review): the branch that precedes this else is missing entirely. */
else
{
/* surrogate */
{
{
/* Subtract the 0x10000 offset from the code point.
* NOTE(review): the surrounding statements are missing. */
uc -= 0x10000;
}
}
else /* invalid encoding. */
/* Step over the offending element. */
pwc++;
}
}
/* The pwsz pointer is returned to the caller. */
return pwsz;
}
/**
* Validates the UTF-16 encoding and calculates the length of the UTF-8 encoding.
*
* @returns iprt status code.
* @param pwsz The UTF-16 string.
* @param cwc The max length of the UTF-16 string to consider.
* @param pcch Where to store the length (excluding '\0') of the UTF-8 string. (cch == cb, btw)
*/
/* NOTE(review): the signature, the declarations of wc/cch, the statements that
* fetch each element and set rc on failure, and the final store through pcch
* are not visible in this view. */
{
int rc = VINF_SUCCESS;
while (cwc > 0)
{
/* A zero element ends the string early. */
if (!wc)
break;
/* NOTE(review): the condition opening this branch is missing; it appears to
* be the non-surrogate case, given the else branch below. */
{
/* Add the number of UTF-8 bytes needed for this value: 1, 2 or 3. */
if (wc < 0x80)
cch++;
else if (wc < 0x800)
cch += 2;
else if (wc < 0xfffe)
cch += 3;
else
{
/* Values of 0xfffe and above are not accepted here.
* NOTE(review): the statement that sets rc is missing. */
break;
}
}
else
{
/* A value of 0xdc00 or above in this branch stops the scan.
* NOTE(review): the statement that sets rc is missing. */
if (wc >= 0xdc00)
{
break;
}
/* No input left for the second half of the pair.
* NOTE(review): the statement that sets rc is missing. */
if (cwc <= 0)
{
break;
}
/* NOTE(review): the condition for this bail-out is missing; presumably it
* validates the second element of the pair - confirm. */
{
break;
}
/* This branch accounts for four UTF-8 bytes. */
cch += 4;
}
}
/* done */
return rc;
}
/**
* Recodes a valid UTF-16 string as UTF-8.
*
* @returns iprt status code.
* @param pwsz The UTF-16 string.
* @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
* will stop when cwc or '\0' is reached.
* @param psz Where to store the UTF-8 string.
* @param cch The size of the UTF-8 buffer, excluding the terminator.
* @param pcch Where to store the number of octets actually encoded.
*/
/* NOTE(review): the signature, the declarations of wc/wc2/pwch, the statements
* that emit the encoded bytes and set rc on failure, and the final store
* through pcch are not visible in this view. */
{
int rc = VINF_SUCCESS;
while (cwc > 0)
{
/* A zero element ends the string early. */
if (!wc)
break;
/* NOTE(review): the condition opening this branch is missing; it appears to
* be the non-surrogate case, given the else branch below. */
{
if (wc < 0x80)
{
/* One output byte is needed; bail out if the buffer has no room. */
if (cch < 1)
{
RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
break;
}
cch--;
}
else if (wc < 0x800)
{
/* Two output bytes are needed. */
if (cch < 2)
{
RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
break;
}
cch -= 2;
}
else if (wc < 0xfffe)
{
/* Three output bytes are needed. */
if (cch < 3)
{
RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
break;
}
cch -= 3;
}
else
{
/* Values of 0xfffe and above are not accepted here.
* NOTE(review): the statement that sets rc is missing. */
break;
}
}
else
{
/* A value of 0xdc00 or above in this branch stops the recoding.
* NOTE(review): the statement that sets rc is missing. */
if (wc >= 0xdc00)
{
break;
}
/* No input left for the second half of the pair. */
if (cwc <= 0)
{
break;
}
/* NOTE(review): the condition for this bail-out is missing; presumably it
* validates the second element of the pair - confirm. */
{
break;
}
/* Tail of the expression that combines the pair; only the part contributing
* the low ten bits of wc2 is visible in this view. */
| (wc2 & 0x3ff));
/* Four output bytes are needed. */
if (cch < 4)
{
RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
break;
}
cch -= 4;
}
}
/* done */
/* Write the terminator at the current pwch position. */
*pwch = '\0';
return rc;
}
/* NOTE(review): the signature, the calls that set rc, the allocation of
* pszResult and the failure clean-up are not visible in this view; presumably
* this converts a UTF-16 string into a newly allocated UTF-8 string - confirm
* against the complete file. */
{
/*
* Validate input.
*/
/* The output pointer is cleared first so it reads NULL on every failure path
* visible in this block. */
*ppszString = NULL;
/*
* Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
*/
if (RT_SUCCESS(rc))
{
/*
* Allocate buffer and recode it.
*/
if (pszResult)
{
/* NOTE(review): the recoding call that updates rc is missing here. */
if (RT_SUCCESS(rc))
{
/* Success: hand the result buffer to the caller. */
*ppszString = pszResult;
return rc;
}
}
/* NOTE(review): the statement belonging to this else (pszResult was NULL)
* is missing. */
else
}
return rc;
}
/**
* Extended UTF-16 to UTF-8 conversion.
*
* NOTE(review): large parts of the body (input validation, the calls that set
* rc, the buffer-size condition, the allocation and the clean-up) are not
* visible in this view, so the parameter notes below are assumptions taken
* from the names and the visible comments - confirm against the complete file.
*
* @returns iprt status code; VERR_BUFFER_OVERFLOW is returned from the branch
*          that does not allocate (fShouldFree is false there).
* @param pwszString Presumably the UTF-16 string to convert.
* @param cwcString Presumably the maximum number of UTF-16 elements to consider.
* @param ppsz Presumably where the result pointer is read from and/or stored.
* @param cch Presumably the size of a caller-supplied buffer.
* @param pcch Optional; the visible code only acts on it when non-NULL.
*/
RTDECL(int) RTUtf16ToUtf8Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
{
/*
* Validate input.
*/
/*
* Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
*/
if (RT_SUCCESS(rc))
{
/* NOTE(review): the statement guarded by this check is missing. */
if (pcch)
/*
* Check buffer size / Allocate buffer and recode it.
*/
/* fShouldFree records whether this function must release pszResult on failure. */
bool fShouldFree;
char *pszResult;
/* NOTE(review): the condition opening this branch is missing. */
{
fShouldFree = false;
/* NOTE(review): the condition guarding this return is missing. */
return VERR_BUFFER_OVERFLOW;
}
else
{
fShouldFree = true;
}
if (pszResult)
{
/* NOTE(review): the recoding call that updates rc is missing here. */
if (RT_SUCCESS(rc))
{
return rc;
}
/* NOTE(review): the statement guarded by this check is missing. */
if (fShouldFree)
}
/* NOTE(review): the statement belonging to this else (pszResult was NULL)
* is missing. */
else
}
return rc;
}
/* NOTE(review): the signature, the declarations of wc/uc and several
* conditions are not visible in this view; presumably this decodes one code
* point from a UTF-16 string - confirm against the complete file. */
{
/* simple */
/* NOTE(review): the condition guarding this return is missing. */
return wc;
if (wc < 0xfffe)
{
/* surrogate pair */
if (wc < 0xdc00)
{
/* NOTE(review): the condition opening this branch and the computation of
* uc are missing. */
{
return uc;
}
}
/* NOTE(review): the statement belonging to this else is missing. */
else
}
/* NOTE(review): the statement belonging to this else is missing. */
else
/* Every path that did not return above ends with the invalid marker. */
return RTUNICP_INVALID;
}
/* NOTE(review): the signature, the declaration of wc, several conditions and
* the assignments to rc are not visible in this view; presumably this decodes
* one code point and advances the caller's string pointer - confirm against
* the complete file. */
{
/* simple */
/* NOTE(review): the condition opening this branch and the store of the
* result are missing. */
{
/* One element was consumed. */
(*ppwsz)++;
return VINF_SUCCESS;
}
int rc;
if (wc < 0xfffe)
{
/* surrogate pair */
if (wc < 0xdc00)
{
/* NOTE(review): the condition opening this branch and the store of the
* result are missing. */
{
/* Two elements were consumed. */
(*ppwsz) += 2;
return VINF_SUCCESS;
}
}
/* NOTE(review): the statement belonging to this else is missing. */
else
}
else
{
/* NOTE(review): the body of this branch is missing. */
}
/* Failure path: report the invalid marker, step over one element and return
* the status in rc. */
*pCp = RTUNICP_INVALID;
(*ppwsz)++;
return rc;
}
/* NOTE(review): the signature and the statements that store the encoded
* elements are not visible in this view; presumably this encodes one code
* point into a UTF-16 buffer and returns the advanced pointer - confirm
* against the complete file. */
{
/* simple */
/* Code points below 0xd800, or above 0xdfff but below 0xfffe, take this path. */
if ( CodePoint < 0xd800
|| ( CodePoint > 0xdfff
&& CodePoint < 0xfffe))
{
/* NOTE(review): the store of the element is missing. */
return pwsz;
}
/* surrogate pair */
/* NOTE(review): the condition opening this branch and the stores of the two
* elements are missing. */
{
/* Subtract the 0x10000 offset from the code point. */
CodePoint -= 0x10000;
return pwsz;
}
/* invalid code point. */
/* 0x7f is written in place of the code point and the pointer past it is
* returned. */
*pwsz++ = 0x7f;
return pwsz;
}