utf-16.cpp revision f52f0ffe78e35402d04a595087e9f777a2f935fd
/* $Id$ */
/** @file
* IPRT - UTF-16.
*/
/*
* Copyright (C) 2006-2007 Sun Microsystems, Inc.
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*
* The contents of this file may alternatively be used under the terms
* of the Common Development and Distribution License Version 1.0
* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
* VirtualBox OSE distribution, in which case the provisions of the
* CDDL are applicable instead of those of the GPL.
*
* You may elect to license modified versions of this file under the
* terms and conditions of either the GPL or the CDDL or both.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
* Clara, CA 95054 USA or visit http://www.sun.com if you need
* additional information or have any questions.
*/
/*******************************************************************************
* Header Files *
*******************************************************************************/
{
if (pwszString)
}
{
if (pwsz)
return pwsz;
}
{
if (pwsz)
{
*ppwszString = pwsz;
return VINF_SUCCESS;
}
return VERR_NO_MEMORY;
}
{
if (!pwszString)
return 0;
while (*pwsz)
pwsz++;
return pwsz - pwszString;
}
{
return 0;
if (!pwsz1)
return -1;
if (!pwsz2)
return 1;
for (;;)
{
return iDiff;
pwsz1++;
pwsz2++;
}
}
{
return 0;
if (!pwsz1)
return -1;
if (!pwsz2)
return 1;
for (;;)
{
if (iDiff)
{
/* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
if ( wc1 < 0xd800
|| wc2 < 0xd800
|| wc1 > 0xdfff
|| wc2 > 0xdfff)
{
/* simple UCS-2 char */
if (iDiff)
}
else
{
/* a damned pair */
if (wc1 >= 0xdc00)
{
if (pwsz1Start == pwsz1)
return iDiff;
return iDiff;
}
else
{
return iDiff;
}
if (iDiff)
}
if (iDiff)
return iDiff;
}
if (!wc1)
return 0;
pwsz1++;
pwsz2++;
}
}
{
for (;;)
{
if (!wc)
break;
{
if (ucFolded < 0x10000)
}
else
{
/* surrogate */
{
{
uc -= 0x10000;
}
}
else /* invalid encoding. */
pwc++;
}
}
return pwsz;
}
{
for (;;)
{
if (!wc)
break;
else
{
/* surrogate */
{
{
uc -= 0x10000;
}
}
else /* invalid encoding. */
pwc++;
}
}
return pwsz;
}
/**
* Validates the UTF-16 encoding and calculates the length of a UTF-8 encoding.
*
* @returns iprt status code.
* @param pwsz The UTF-16 string.
* @param cwc The max length of the UTF-16 string to consider.
* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
*/
{
int rc = VINF_SUCCESS;
while (cwc > 0)
{
if (!wc)
break;
{
if (wc < 0x80)
cch++;
else if (wc < 0x800)
cch += 2;
else if (wc < 0xfffe)
cch += 3;
else
{
break;
}
}
else
{
if (wc >= 0xdc00)
{
break;
}
if (cwc <= 0)
{
break;
}
{
break;
}
cch += 4;
}
}
/* done */
return rc;
}
/**
* Recodes a valid UTF-16 string as UTF-8.
*
* @returns iprt status code.
* @param pwsz The UTF-16 string.
* @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
* will stop when cwc or '\\0' is reached.
* @param psz Where to store the UTF-8 string.
* @param cch The size of the UTF-8 buffer, excluding the terminator.
* @param pcch Where to store the number of octets actually encoded.
*/
{
int rc = VINF_SUCCESS;
while (cwc > 0)
{
if (!wc)
break;
{
if (wc < 0x80)
{
{
RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
break;
}
cch--;
}
else if (wc < 0x800)
{
{
RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
break;
}
cch -= 2;
}
else if (wc < 0xfffe)
{
{
RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
break;
}
cch -= 3;
}
else
{
break;
}
}
else
{
if (wc >= 0xdc00)
{
break;
}
if (cwc <= 0)
{
break;
}
{
break;
}
| (wc2 & 0x3ff));
{
RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
break;
}
cch -= 4;
}
}
/* done */
*pwch = '\0';
return rc;
}
{
/*
* Validate input.
*/
*ppszString = NULL;
/*
* Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
*/
if (RT_SUCCESS(rc))
{
/*
* Allocate buffer and recode it.
*/
if (pszResult)
{
if (RT_SUCCESS(rc))
{
*ppszString = pszResult;
return rc;
}
}
else
}
return rc;
}
/**
* Converts a UTF-16 string to UTF-8, going by the function name and the step
* comments in the body.
*
* NOTE(review): this copy of the function looks incomplete - rc is tested
* without a visible declaration or assignment, and several if/else statements
* have no body. The notes here describe only what is visible and must be
* confirmed against the complete source.
*
* @returns The status code held in rc. VERR_BUFFER_OVERFLOW is returned
*          directly from the branch that sets fShouldFree to false.
* @param   pwszString  Presumably the UTF-16 input string - not referenced in
*                      the visible body, TODO confirm.
* @param   cwcString   Presumably the maximum number of RTUTF16 units to read
*                      from pwszString - TODO confirm.
* @param   ppsz        Presumably the address of the output buffer pointer -
*                      not referenced in the visible body, TODO confirm.
* @param   cch         Presumably the size of the output buffer - TODO confirm.
* @param   pcch        Tested for NULL before use, so it appears to be
*                      optional; presumably receives the UTF-8 length.
*/
RTDECL(int) RTUtf16ToUtf8Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
{
/*
* Validate input.
*/
/*
* Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
*/
/* NOTE(review): the statement that assigns rc is not visible in this copy. */
if (RT_SUCCESS(rc))
{
/* NOTE(review): the statement guarded by this NULL check is not visible. */
if (pcch)
/*
* Check buffer size / Allocate buffer and recode it.
*/
/* fShouldFree is false in the first branch below and true in the else branch;
   presumably it records whether pszResult was allocated here - TODO confirm. */
bool fShouldFree;
char *pszResult;
{
fShouldFree = false;
return VERR_BUFFER_OVERFLOW;
}
else
{
fShouldFree = true;
}
if (pszResult)
{
if (RT_SUCCESS(rc))
{
return rc;
}
/* NOTE(review): the statement guarded by fShouldFree is not visible. */
if (fShouldFree)
}
/* NOTE(review): the else body (pszResult is NULL) is not visible. */
else
}
return rc;
}
{
}
{
if (pcch)
return rc;
}
{
/* simple */
return wc;
if (wc < 0xfffe)
{
/* surrogate pair */
if (wc < 0xdc00)
{
{
return uc;
}
}
else
}
else
return RTUNICP_INVALID;
}
{
/* simple */
{
(*ppwsz)++;
return VINF_SUCCESS;
}
int rc;
if (wc < 0xfffe)
{
/* surrogate pair */
if (wc < 0xdc00)
{
{
(*ppwsz) += 2;
return VINF_SUCCESS;
}
}
else
}
else
{
}
*pCp = RTUNICP_INVALID;
(*ppwsz)++;
return rc;
}
{
/* simple */
if ( CodePoint < 0xd800
|| ( CodePoint > 0xdfff
&& CodePoint < 0xfffe))
{
return pwsz;
}
/* surrogate pair */
{
CodePoint -= 0x10000;
return pwsz;
}
/* invalid code point. */
*pwsz++ = 0x7f;
return pwsz;
}
/**
* Validates the UTF-16 encoding and calculates the length of a Latin1 encoding.
*
* @returns iprt status code.
* @param pwsz The UTF-16 string.
* @param cwc The max length of the UTF-16 string to consider.
* @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw)
*/
{
int rc = VINF_SUCCESS;
while (cwc > 0)
{
if (!wc)
break;
++cch;
else
{
{
if (wc >= 0xfffe)
{
break;
}
}
else
{
if (wc >= 0xdc00)
{
break;
}
if (cwc <= 0)
{
break;
}
{
break;
}
}
break;
}
}
/* done */
return rc;
}
/**
* Recodes a valid UTF-16 string as Latin1.
*
* @returns iprt status code.
* @param pwsz The UTF-16 string.
* @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
* will stop when cwc or '\\0' is reached.
* @param psz Where to store the Latin1 string.
* @param cch The size of the Latin1 buffer, excluding the terminator.
*/
{
int rc = VINF_SUCCESS;
while (cwc > 0)
{
if (!wc)
break;
{
{
RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
break;
}
cch--;
}
else
{
{
if (wc >= 0xfffe)
{
break;
}
}
else
{
if (wc >= 0xdc00)
{
break;
}
if (cwc <= 0)
{
break;
}
{
break;
}
}
break;
}
}
/* done */
*pch = '\0';
return rc;
}
{
/*
* Validate input.
*/
*ppszString = NULL;
/*
* Validate the UTF-16 string and calculate the length of the Latin1 encoding of it.
*/
if (RT_SUCCESS(rc))
{
/*
* Allocate buffer and recode it.
*/
if (pszResult)
{
if (RT_SUCCESS(rc))
{
*ppszString = pszResult;
return rc;
}
}
else
}
return rc;
}
/**
* Converts a UTF-16 string to Latin1, going by the function name and the step
* comments in the body.
*
* NOTE(review): this copy of the function looks incomplete - rc is tested
* without a visible declaration or assignment, and several if/else statements
* have no body. The notes here describe only what is visible and must be
* confirmed against the complete source.
*
* @returns The status code held in rc. VERR_BUFFER_OVERFLOW is returned
*          directly from the branch that sets fShouldFree to false.
* @param   pwszString  Presumably the UTF-16 input string - not referenced in
*                      the visible body, TODO confirm.
* @param   cwcString   Presumably the maximum number of RTUTF16 units to read
*                      from pwszString - TODO confirm.
* @param   ppsz        Presumably the address of the output buffer pointer -
*                      not referenced in the visible body, TODO confirm.
* @param   cch         Presumably the size of the output buffer - TODO confirm.
* @param   pcch        Tested for NULL before use, so it appears to be
*                      optional; presumably receives the Latin1 length.
*/
RTDECL(int) RTUtf16ToLatin1Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
{
/*
* Validate input.
*/
/*
* Validate the UTF-16 string and calculate the length of the Latin1 encoding of it.
*/
/* NOTE(review): the statement that assigns rc is not visible in this copy. */
if (RT_SUCCESS(rc))
{
/* NOTE(review): the statement guarded by this NULL check is not visible. */
if (pcch)
/*
* Check buffer size / Allocate buffer and recode it.
*/
/* fShouldFree is false in the first branch below and true in the else branch;
   presumably it records whether pszResult was allocated here - TODO confirm. */
bool fShouldFree;
char *pszResult;
{
fShouldFree = false;
return VERR_BUFFER_OVERFLOW;
}
else
{
fShouldFree = true;
}
if (pszResult)
{
if (RT_SUCCESS(rc))
{
return rc;
}
/* NOTE(review): the statement guarded by fShouldFree is not visible. */
if (fShouldFree)
}
/* NOTE(review): the else body (pszResult is NULL) is not visible. */
else
}
return rc;
}
{
}
{
if (pcch)
return rc;
}
/**
* Calculates the UTF-16 length of a Latin1 string. In fact this is just the
* original length, but the function saves us nasty comments to that effect
* all over the place.
*
* @returns IPRT status code.
* @param psz Pointer to the Latin1 string.
* @param cch The max length of the string. (btw cch = cb)
* Use RTSTR_MAX if all of the string is to be examined.
* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
*/
{
return VINF_SUCCESS;
}
/**
* Recodes a Latin1 string as UTF-16. This is just a case of expanding it to
* sixteen bits, as Unicode is a superset of Latin1.
*
* Since we know the input is valid, we do *not* perform length checks.
*
* @returns iprt status code.
* @param psz The Latin1 string to recode.
* @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string.
* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
* @param pwsz Where to store the UTF-16 string.
* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
*/
{
int rc = VINF_SUCCESS;
while (cch-- > 0)
{
/* read the next char and check for terminator. */
if (!uch)
break;
/* check for output overflow */
{
break;
}
/* expand the code point */
cwc--;
puch++;
}
/* done */
*pwc = '\0';
return rc;
}
{
/*
* Validate input.
*/
*ppwszString = NULL;
/*
* Validate the input and calculate the length of the UTF-16 string.
*/
if (RT_SUCCESS(rc))
{
/*
* Allocate buffer.
*/
if (pwsz)
{
/*
* Encode the UTF-16 string.
*/
if (RT_SUCCESS(rc))
{
*ppwszString = pwsz;
return rc;
}
}
else
}
return rc;
}
/**
* Converts a Latin1 string to UTF-16, going by the function name and the step
* comments in the body.
*
* NOTE(review): this copy of the function looks incomplete - rc and
* pwszResult are used without a visible declaration, and several if/else
* statements have no body. The notes here describe only what is visible and
* must be confirmed against the complete source.
*
* @returns The status code held in rc. VERR_BUFFER_OVERFLOW is returned
*          directly from the branch that sets fShouldFree to false.
* @param   pszString   Presumably the Latin1 input string - not referenced in
*                      the visible body, TODO confirm.
* @param   cchString   Presumably the maximum number of chars to read from
*                      pszString - TODO confirm.
* @param   ppwsz       Address of the output buffer pointer. It is read into
*                      pwszResult in the first branch and written back with
*                      pwszResult when rc indicates success.
* @param   cwc         Presumably the capacity of the output buffer in RTUTF16
*                      units - TODO confirm.
* @param   pcwc        Tested for NULL before use, so it appears to be
*                      optional; presumably receives the UTF-16 length.
*/
RTDECL(int) RTLatin1ToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
{
/*
* Validate input.
*/
/*
* Validate the input and calculate the length of the UTF-16 string.
*/
/* NOTE(review): the statement that assigns rc is not visible in this copy. */
if (RT_SUCCESS(rc))
{
/* NOTE(review): the statement guarded by this NULL check is not visible. */
if (pcwc)
/*
* Check buffer size / Allocate buffer.
*/
/* fShouldFree is false in the first branch below and true in the else branch;
   presumably it records whether pwszResult was allocated here - TODO confirm. */
bool fShouldFree;
{
fShouldFree = false;
return VERR_BUFFER_OVERFLOW;
/* NOTE(review): as written here this assignment follows an unconditional
   return and cannot execute; the condition that guarded the return is
   presumably missing from this copy. */
pwszResult = *ppwsz;
}
else
{
fShouldFree = true;
}
if (pwszResult)
{
/*
* Encode the UTF-16 string.
*/
if (RT_SUCCESS(rc))
{
/* Hand the result buffer back to the caller on success. */
*ppwsz = pwszResult;
return rc;
}
/* NOTE(review): the statement guarded by fShouldFree is not visible. */
if (fShouldFree)
}
/* NOTE(review): the else body (pwszResult is NULL) is not visible. */
else
}
return rc;
}
{
}
{
if (pcwc)
return rc;
}