utf-8.cpp revision c58f1213e628a545081c70e26c6b67a841cff880
/* $Id$ */
/** @file
* IPRT - UTF-8 Decoding.
*/
/*
* Copyright (C) 2006-2012 Oracle Corporation
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*
* The contents of this file may alternatively be used under the terms
* of the Common Development and Distribution License Version 1.0
* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
* VirtualBox OSE distribution, in which case the provisions of the
* CDDL are applicable instead of those of the GPL.
*
* You may elect to license modified versions of this file under the
* terms and conditions of either the GPL or the CDDL or both.
*/
/*******************************************************************************
* Header Files *
*******************************************************************************/
#include <iprt/string.h>
#include "internal/iprt.h"
#include <iprt/uni.h>
#include <iprt/alloc.h>
#include <iprt/assert.h>
#include <iprt/err.h>
#include "internal/string.h"
/**
* Get get length in code points of a UTF-8 encoded string.
* The string is validated while doing this.
*
* @returns IPRT status code.
* @param psz Pointer to the UTF-8 string.
* @param cch The max length of the string. (btw cch = cb)
* Use RTSTR_MAX if all of the string is to be examined.
* @param pcuc Where to store the length in unicode code points.
* @param pcchActual Where to store the actual size of the UTF-8 string
* on success (cch = cb again). Optional.
*/
DECLHIDDEN(int) rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
{
const unsigned char *puch = (const unsigned char *)psz;
size_t cCodePoints = 0;
while (cch > 0)
{
const unsigned char uch = *puch;
if (!uch)
break;
if (uch & RT_BIT(7))
{
/* figure sequence length and validate the first byte */
/** @todo RT_USE_RTC_3629 */
unsigned cb;
if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
cb = 2;
else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
cb = 3;
else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
cb = 4;
else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
cb = 5;
else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
cb = 6;
else
{
RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
return VERR_INVALID_UTF8_ENCODING;
}
/* check length */
if (cb > cch)
{
RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
return VERR_INVALID_UTF8_ENCODING;
}
/* validate the rest */
switch (cb)
{
case 6:
RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
case 5:
RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
case 4:
RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
case 3:
RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
case 2:
RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
break;
}
/* validate the code point. */
RTUNICP uc;
switch (cb)
{
case 6:
uc = (puch[5] & 0x3f)
| ((RTUNICP)(puch[4] & 0x3f) << 6)
| ((RTUNICP)(puch[3] & 0x3f) << 12)
| ((RTUNICP)(puch[2] & 0x3f) << 18)
| ((RTUNICP)(puch[1] & 0x3f) << 24)
| ((RTUNICP)(uch & 0x01) << 30);
RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
break;
case 5:
uc = (puch[4] & 0x3f)
| ((RTUNICP)(puch[3] & 0x3f) << 6)
| ((RTUNICP)(puch[2] & 0x3f) << 12)
| ((RTUNICP)(puch[1] & 0x3f) << 18)
| ((RTUNICP)(uch & 0x03) << 24);
RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
break;
case 4:
uc = (puch[3] & 0x3f)
| ((RTUNICP)(puch[2] & 0x3f) << 6)
| ((RTUNICP)(puch[1] & 0x3f) << 12)
| ((RTUNICP)(uch & 0x07) << 18);
RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
break;
case 3:
uc = (puch[2] & 0x3f)
| ((RTUNICP)(puch[1] & 0x3f) << 6)
| ((RTUNICP)(uch & 0x0f) << 12);
RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
break;
case 2:
uc = (puch[1] & 0x3f)
| ((RTUNICP)(uch & 0x1f) << 6);
RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
break;
}
/* advance */
cch -= cb;
puch += cb;
}
else
{
/* one ASCII byte */
puch++;
cch--;
}
cCodePoints++;
}
/* done */
*pcuc = cCodePoints;
if (pcchActual)
*pcchActual = puch - (unsigned char const *)psz;
return VINF_SUCCESS;
}
/**
* Decodes and UTF-8 string into an array of unicode code point.
*
* Since we know the input is valid, we do *not* perform encoding or length checks.
*
* @returns iprt status code.
* @param psz The UTF-8 string to recode. This is a valid encoding.
* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
* @param paCps Where to store the code points array.
* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
*/
static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
{
int rc = VINF_SUCCESS;
const unsigned char *puch = (const unsigned char *)psz;
PRTUNICP pCp = paCps;
while (cch > 0)
{
/* read the next char and check for terminator. */
const unsigned char uch = *puch;
if (!uch)
break;
/* check for output overflow */
if (RT_UNLIKELY(cCps < 1))
{
rc = VERR_BUFFER_OVERFLOW;
break;
}
cCps--;
/* decode and recode the code point */
if (!(uch & RT_BIT(7)))
{
*pCp++ = uch;
puch++;
cch--;
}
#ifdef RT_STRICT
else if (!(uch & RT_BIT(6)))
AssertMsgFailed(("Internal error!\n"));
#endif
else if (!(uch & RT_BIT(5)))
{
*pCp++ = (puch[1] & 0x3f)
| ((uint16_t)(uch & 0x1f) << 6);
puch += 2;
cch -= 2;
}
else if (!(uch & RT_BIT(4)))
{
*pCp++ = (puch[2] & 0x3f)
| ((uint16_t)(puch[1] & 0x3f) << 6)
| ((uint16_t)(uch & 0x0f) << 12);
puch += 3;
cch -= 3;
}
else if (!(uch & RT_BIT(3)))
{
*pCp++ = (puch[3] & 0x3f)
| ((RTUNICP)(puch[2] & 0x3f) << 6)
| ((RTUNICP)(puch[1] & 0x3f) << 12)
| ((RTUNICP)(uch & 0x07) << 18);
puch += 4;
cch -= 4;
}
else if (!(uch & RT_BIT(2)))
{
*pCp++ = (puch[4] & 0x3f)
| ((RTUNICP)(puch[3] & 0x3f) << 6)
| ((RTUNICP)(puch[2] & 0x3f) << 12)
| ((RTUNICP)(puch[1] & 0x3f) << 18)
| ((RTUNICP)(uch & 0x03) << 24);
puch += 5;
cch -= 6;
}
else
{
Assert(!(uch & RT_BIT(1)));
*pCp++ = (puch[5] & 0x3f)
| ((RTUNICP)(puch[4] & 0x3f) << 6)
| ((RTUNICP)(puch[3] & 0x3f) << 12)
| ((RTUNICP)(puch[2] & 0x3f) << 18)
| ((RTUNICP)(puch[1] & 0x3f) << 24)
| ((RTUNICP)(uch & 0x01) << 30);
puch += 6;
cch -= 6;
}
}
/* done */
*pCp = 0;
return rc;
}
RTDECL(size_t) RTStrUniLen(const char *psz)
{
size_t cCodePoints;
int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
return RT_SUCCESS(rc) ? cCodePoints : 0;
}
RT_EXPORT_SYMBOL(RTStrUniLen);
RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
{
size_t cCodePoints;
int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
if (pcCps)
*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
return rc;
}
RT_EXPORT_SYMBOL(RTStrUniLenEx);
RTDECL(int) RTStrValidateEncoding(const char *psz)
{
return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
}
RT_EXPORT_SYMBOL(RTStrValidateEncoding);
RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
{
AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
AssertPtr(psz);
/*
* Use rtUtf8Length for the job.
*/
size_t cchActual;
size_t cCpsIgnored;
int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
if (RT_SUCCESS(rc))
{
if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
&& cchActual >= cch)
rc = VERR_BUFFER_OVERFLOW;
}
return rc;
}
RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
RTDECL(bool) RTStrIsValidEncoding(const char *psz)
{
int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
return RT_SUCCESS(rc);
}
RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
RTDECL(size_t) RTStrPurgeEncoding(char *psz)
{
size_t cErrors = 0;
for (;;)
{
RTUNICP Cp;
int rc = RTStrGetCpEx((const char **)&psz, &Cp);
if (RT_SUCCESS(rc))
{
if (!Cp)
break;
}
else
{
psz[-1] = '?';
cErrors++;
}
}
return cErrors;
}
RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidSet, char chReplacement)
{
size_t cReplacements = 0;
AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
for (;;)
{
RTUNICP Cp;
PCRTUNICP pCp;
char *pszOld = psz;
if (RT_FAILURE(RTStrGetCpEx((const char **)&psz, &Cp)))
return -1;
if (!Cp)
break;
for (pCp = puszValidSet; *pCp; pCp += 2)
{
AssertReturn(*(pCp + 1), -1);
if (*pCp <= Cp && *(pCp + 1) >= Cp) /* No, I won't do * and ++. */
break;
}
if (!*pCp)
{
for (; pszOld != psz; ++pszOld)
*pszOld = chReplacement;
++cReplacements;
}
}
return cReplacements;
}
RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
{
/*
* Validate input.
*/
Assert(VALID_PTR(pszString));
Assert(VALID_PTR(ppaCps));
*ppaCps = NULL;
/*
* Validate the UTF-8 input and count its code points.
*/
size_t cCps;
int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
if (RT_SUCCESS(rc))
{
/*
* Allocate buffer.
*/
PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
if (paCps)
{
/*
* Decode the string.
*/
rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
if (RT_SUCCESS(rc))
{
*ppaCps = paCps;
return rc;
}
RTMemFree(paCps);
}
else
rc = VERR_NO_CODE_POINT_MEMORY;
}
return rc;
}
RT_EXPORT_SYMBOL(RTStrToUni);
RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
{
/*
* Validate input.
*/
Assert(VALID_PTR(pszString));
Assert(VALID_PTR(ppaCps));
Assert(!pcCps || VALID_PTR(pcCps));
/*
* Validate the UTF-8 input and count the code points.
*/
size_t cCpsResult;
int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
if (RT_SUCCESS(rc))
{
if (pcCps)
*pcCps = cCpsResult;
/*
* Check buffer size / Allocate buffer.
*/
bool fShouldFree;
PRTUNICP paCpsResult;
if (cCps > 0 && *ppaCps)
{
fShouldFree = false;
if (cCps <= cCpsResult)
return VERR_BUFFER_OVERFLOW;
paCpsResult = *ppaCps;
}
else
{
*ppaCps = NULL;
fShouldFree = true;
cCps = RT_MAX(cCpsResult + 1, cCps);
paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
}
if (paCpsResult)
{
/*
* Encode the UTF-16 string.
*/
rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
if (RT_SUCCESS(rc))
{
*ppaCps = paCpsResult;
return rc;
}
if (fShouldFree)
RTMemFree(paCpsResult);
}
else
rc = VERR_NO_CODE_POINT_MEMORY;
}
return rc;
}
RT_EXPORT_SYMBOL(RTStrToUniEx);
/**
* Calculates the UTF-16 length of a string, validating the encoding while doing so.
*
* @returns IPRT status code.
* @param psz Pointer to the UTF-8 string.
* @param cch The max length of the string. (btw cch = cb)
* Use RTSTR_MAX if all of the string is to be examined.
* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
*/
static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
{
const unsigned char *puch = (const unsigned char *)psz;
size_t cwc = 0;
while (cch > 0)
{
const unsigned char uch = *puch;
if (!uch)
break;
if (!(uch & RT_BIT(7)))
{
/* one ASCII byte */
cwc++;
puch++;
cch--;
}
else
{
/* figure sequence length and validate the first byte */
unsigned cb;
if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
cb = 2;
else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
cb = 3;
else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
cb = 4;
else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
cb = 5;
else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
cb = 6;
else
{
RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
return VERR_INVALID_UTF8_ENCODING;
}
/* check length */
if (cb > cch)
{
RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
return VERR_INVALID_UTF8_ENCODING;
}
/* validate the rest */
switch (cb)
{
case 6:
RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
case 5:
RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
case 4:
RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
case 3:
RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
case 2:
RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
break;
}
/* validate the code point. */
RTUNICP uc;
switch (cb)
{
case 6:
uc = (puch[5] & 0x3f)
| ((RTUNICP)(puch[4] & 0x3f) << 6)
| ((RTUNICP)(puch[3] & 0x3f) << 12)
| ((RTUNICP)(puch[2] & 0x3f) << 18)
| ((RTUNICP)(puch[1] & 0x3f) << 24)
| ((RTUNICP)(uch & 0x01) << 30);
RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
return VERR_CANT_RECODE_AS_UTF16;
case 5:
uc = (puch[4] & 0x3f)
| ((RTUNICP)(puch[3] & 0x3f) << 6)
| ((RTUNICP)(puch[2] & 0x3f) << 12)
| ((RTUNICP)(puch[1] & 0x3f) << 18)
| ((RTUNICP)(uch & 0x03) << 24);
RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
return VERR_CANT_RECODE_AS_UTF16;
case 4:
uc = (puch[3] & 0x3f)
| ((RTUNICP)(puch[2] & 0x3f) << 6)
| ((RTUNICP)(puch[1] & 0x3f) << 12)
| ((RTUNICP)(uch & 0x07) << 18);
RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
RTStrAssertMsgReturn(uc <= 0x0010ffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
cwc++;
break;
case 3:
uc = (puch[2] & 0x3f)
| ((RTUNICP)(puch[1] & 0x3f) << 6)
| ((RTUNICP)(uch & 0x0f) << 12);
RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
break;
case 2:
uc = (puch[1] & 0x3f)
| ((RTUNICP)(uch & 0x1f) << 6);
RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
break;
}
/* advance */
cch -= cb;
puch += cb;
cwc++;
}
}
/* done */
*pcwc = cwc;
return VINF_SUCCESS;
}
/**
* Recodes a valid UTF-8 string as UTF-16.
*
* Since we know the input is valid, we do *not* perform encoding or length checks.
*
* @returns iprt status code.
* @param psz The UTF-8 string to recode. This is a valid encoding.
* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
* @param pwsz Where to store the UTF-16 string.
* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
*/
static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
{
int rc = VINF_SUCCESS;
const unsigned char *puch = (const unsigned char *)psz;
PRTUTF16 pwc = pwsz;
while (cch > 0)
{
/* read the next char and check for terminator. */
const unsigned char uch = *puch;
if (!uch)
break;
/* check for output overflow */
if (RT_UNLIKELY(cwc < 1))
{
rc = VERR_BUFFER_OVERFLOW;
break;
}
cwc--;
/* decode and recode the code point */
if (!(uch & RT_BIT(7)))
{
*pwc++ = uch;
puch++;
cch--;
}
else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
{
uint16_t uc = (puch[1] & 0x3f)
| ((uint16_t)(uch & 0x1f) << 6);
*pwc++ = uc;
puch += 2;
cch -= 2;
}
else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
{
uint16_t uc = (puch[2] & 0x3f)
| ((uint16_t)(puch[1] & 0x3f) << 6)
| ((uint16_t)(uch & 0x0f) << 12);
*pwc++ = uc;
puch += 3;
cch -= 3;
}
else
{
/* generate surrogate pair */
Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
RTUNICP uc = (puch[3] & 0x3f)
| ((RTUNICP)(puch[2] & 0x3f) << 6)
| ((RTUNICP)(puch[1] & 0x3f) << 12)
| ((RTUNICP)(uch & 0x07) << 18);
if (RT_UNLIKELY(cwc < 1))
{
rc = VERR_BUFFER_OVERFLOW;
break;
}
cwc--;
uc -= 0x10000;
*pwc++ = 0xd800 | (uc >> 10);
*pwc++ = 0xdc00 | (uc & 0x3ff);
puch += 4;
cch -= 4;
}
}
/* done */
*pwc = '\0';
return rc;
}
RTDECL(int) RTStrToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
{
/*
* Validate input.
*/
Assert(VALID_PTR(ppwszString));
Assert(VALID_PTR(pszString));
*ppwszString = NULL;
/*
* Validate the UTF-8 input and calculate the length of the UTF-16 string.
*/
size_t cwc;
int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
if (RT_SUCCESS(rc))
{
/*
* Allocate buffer.
*/
PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
if (pwsz)
{
/*
* Encode the UTF-16 string.
*/
rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
if (RT_SUCCESS(rc))
{
*ppwszString = pwsz;
return rc;
}
RTMemFree(pwsz);
}
else
rc = VERR_NO_UTF16_MEMORY;
}
return rc;
}
RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
{
/*
* Validate input.
*/
Assert(VALID_PTR(pszString));
Assert(VALID_PTR(ppwsz));
Assert(!pcwc || VALID_PTR(pcwc));
/*
* Validate the UTF-8 input and calculate the length of the UTF-16 string.
*/
size_t cwcResult;
int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
if (RT_SUCCESS(rc))
{
if (pcwc)
*pcwc = cwcResult;
/*
* Check buffer size / Allocate buffer.
*/
bool fShouldFree;
PRTUTF16 pwszResult;
if (cwc > 0 && *ppwsz)
{
fShouldFree = false;
if (cwc <= cwcResult)
return VERR_BUFFER_OVERFLOW;
pwszResult = *ppwsz;
}
else
{
*ppwsz = NULL;
fShouldFree = true;
cwc = RT_MAX(cwcResult + 1, cwc);
pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
}
if (pwszResult)
{
/*
* Encode the UTF-16 string.
*/
rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
if (RT_SUCCESS(rc))
{
*ppwsz = pwszResult;
return rc;
}
if (fShouldFree)
RTMemFree(pwszResult);
}
else
rc = VERR_NO_UTF16_MEMORY;
}
return rc;
}
RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
{
size_t cwc;
int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
return RT_SUCCESS(rc) ? cwc : 0;
}
RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
{
size_t cwc;
int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
if (pcwc)
*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
return rc;
}
RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
/**
* Calculates the length of the UTF-8 encoding of a Latin-1 string.
*
* @returns iprt status code.
* @param psz The Latin-1 string.
* @param cchIn The max length of the Latin-1 string to consider.
* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
*/
static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch)
{
size_t cch = 0;
for (;;)
{
RTUNICP Cp;
int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
if (Cp == 0 || rc == VERR_END_OF_STRING)
break;
if (RT_FAILURE(rc))
return rc;
cch += RTStrCpSize(Cp); /* cannot fail */
}
/* done */
*pcch = cch;
return VINF_SUCCESS;
}
/**
* Recodes a Latin-1 string as UTF-8.
*
* @returns iprt status code.
* @param psz The Latin-1 string.
* @param cchIn The number of characters to process from psz. The recoding
* will stop when cch or '\\0' is reached.
* @param psz Where to store the UTF-8 string.
* @param cch The size of the UTF-8 buffer, excluding the terminator.
*/
static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
{
int rc;
for (;;)
{
RTUNICP Cp;
size_t cchCp;
rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
if (Cp == 0 || RT_FAILURE(rc))
break;
cchCp = RTStrCpSize(Cp);
if (RT_UNLIKELY(cch < cchCp))
{
RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
rc = VERR_BUFFER_OVERFLOW;
break;
}
cch -= cchCp;
psz = RTStrPutCp(psz, Cp);
}
/* done */
if (rc == VERR_END_OF_STRING)
rc = VINF_SUCCESS;
*psz = '\0';
return rc;
}
RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag)
{
/*
* Validate input.
*/
Assert(VALID_PTR(ppszString));
Assert(VALID_PTR(pszString));
*ppszString = NULL;
/*
* Calculate the length of the UTF-8 encoding of the Latin-1 string.
*/
size_t cch;
int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
if (RT_SUCCESS(rc))
{
/*
* Allocate buffer and recode it.
*/
char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
if (pszResult)
{
rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
if (RT_SUCCESS(rc))
{
*ppszString = pszResult;
return rc;
}
RTMemFree(pszResult);
}
else
rc = VERR_NO_STR_MEMORY;
}
return rc;
}
RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
{
/*
* Validate input.
*/
Assert(VALID_PTR(pszString));
Assert(VALID_PTR(ppsz));
Assert(!pcch || VALID_PTR(pcch));
/*
* Calculate the length of the UTF-8 encoding of the Latin-1 string.
*/
size_t cchResult;
int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
if (RT_SUCCESS(rc))
{
if (pcch)
*pcch = cchResult;
/*
* Check buffer size / Allocate buffer and recode it.
*/
bool fShouldFree;
char *pszResult;
if (cch > 0 && *ppsz)
{
fShouldFree = false;
if (RT_UNLIKELY(cch <= cchResult))
return VERR_BUFFER_OVERFLOW;
pszResult = *ppsz;
}
else
{
*ppsz = NULL;
fShouldFree = true;
cch = RT_MAX(cch, cchResult + 1);
pszResult = (char *)RTStrAllocTag(cch, pszTag);
}
if (pszResult)
{
rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
if (RT_SUCCESS(rc))
{
*ppsz = pszResult;
return rc;
}
if (fShouldFree)
RTStrFree(pszResult);
}
else
rc = VERR_NO_STR_MEMORY;
}
return rc;
}
RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
{
size_t cch;
int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
return RT_SUCCESS(rc) ? cch : 0;
}
RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch)
{
size_t cch;
int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
if (pcch)
*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
return rc;
}
RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
/**
* Calculates the Latin-1 length of a string, validating the encoding while
* doing so.
*
* @returns IPRT status code.
* @param psz Pointer to the UTF-8 string.
* @param cchIn The max length of the string. (btw cch = cb)
* Use RTSTR_MAX if all of the string is to be examined.
* @param pcch Where to store the length of the Latin-1 string in bytes.
*/
static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
{
size_t cch = 0;
for (;;)
{
RTUNICP Cp;
size_t cchCp;
int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
if (Cp == 0 || rc == VERR_END_OF_STRING)
break;
if (RT_FAILURE(rc))
return rc;
cchCp = RTLatin1CpSize(Cp);
if (cchCp == 0)
return VERR_NO_TRANSLATION;
cch += cchCp;
}
/* done */
*pcch = cch;
return VINF_SUCCESS;
}
/**
* Recodes a valid UTF-8 string as Latin-1.
*
* Since we know the input is valid, we do *not* perform encoding or length checks.
*
* @returns iprt status code.
* @param pszIn The UTF-8 string to recode. This is a valid encoding.
* @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
* @param psz Where to store the Latin-1 string.
* @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
*/
static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
{
int rc;
for (;;)
{
RTUNICP Cp;
size_t cchCp;
rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
if (Cp == 0 || RT_FAILURE(rc))
break;
cchCp = RTLatin1CpSize(Cp);
if (RT_UNLIKELY(cch < cchCp))
{
RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
rc = VERR_BUFFER_OVERFLOW;
break;
}
cch -= cchCp;
psz = RTLatin1PutCp(psz, Cp);
}
/* done */
if (rc == VERR_END_OF_STRING)
rc = VINF_SUCCESS;
*psz = '\0';
return rc;
}
RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag)
{
/*
* Validate input.
*/
Assert(VALID_PTR(ppszString));
Assert(VALID_PTR(pszString));
*ppszString = NULL;
/*
* Validate the UTF-8 input and calculate the length of the Latin-1 string.
*/
size_t cch;
int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
if (RT_SUCCESS(rc))
{
/*
* Allocate buffer.
*/
char *psz = (char *)RTMemAllocTag(cch + 1, pszTag);
if (psz)
{
/*
* Encode the UTF-16 string.
*/
rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
if (RT_SUCCESS(rc))
{
*ppszString = psz;
return rc;
}
RTMemFree(psz);
}
else
rc = VERR_NO_STR_MEMORY;
}
return rc;
}
RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
{
/*
* Validate input.
*/
Assert(VALID_PTR(pszString));
Assert(VALID_PTR(ppsz));
Assert(!pcch || VALID_PTR(pcch));
/*
* Validate the UTF-8 input and calculate the length of the UTF-16 string.
*/
size_t cchResult;
int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
if (RT_SUCCESS(rc))
{
if (pcch)
*pcch = cchResult;
/*
* Check buffer size / Allocate buffer.
*/
bool fShouldFree;
char *pszResult;
if (cch > 0 && *ppsz)
{
fShouldFree = false;
if (cch <= cchResult)
return VERR_BUFFER_OVERFLOW;
pszResult = *ppsz;
}
else
{
*ppsz = NULL;
fShouldFree = true;
cch = RT_MAX(cchResult + 1, cch);
pszResult = (char *)RTMemAllocTag(cch, pszTag);
}
if (pszResult)
{
/*
* Encode the Latin-1 string.
*/
rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
if (RT_SUCCESS(rc))
{
*ppsz = pszResult;
return rc;
}
if (fShouldFree)
RTMemFree(pszResult);
}
else
rc = VERR_NO_STR_MEMORY;
}
return rc;
}
RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
{
size_t cch;
int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
return RT_SUCCESS(rc) ? cch : 0;
}
RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch)
{
size_t cch;
int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
if (pcch)
*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
return rc;
}
RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
/**
* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
* @returns rc
* @param ppsz The pointer to the string position point.
* @param pCp Where to store RTUNICP_INVALID.
* @param rc The iprt error code.
*/
static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
{
/*
* Try find a valid encoding.
*/
(*ppsz)++; /** @todo code this! */
*pCp = RTUNICP_INVALID;
return rc;
}
RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
{
RTUNICP Cp;
RTStrGetCpExInternal(&psz, &Cp);
return Cp;
}
RT_EXPORT_SYMBOL(RTStrGetCpInternal);
RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
{
const unsigned char *puch = (const unsigned char *)*ppsz;
const unsigned char uch = *puch;
RTUNICP uc;
/* ASCII ? */
if (!(uch & RT_BIT(7)))
{
uc = uch;
puch++;
}
else if (uch & RT_BIT(6))
{
/* figure the length and validate the first octet. */
/** @todo RT_USE_RTC_3629 */
unsigned cb;
if (!(uch & RT_BIT(5)))
cb = 2;
else if (!(uch & RT_BIT(4)))
cb = 3;
else if (!(uch & RT_BIT(3)))
cb = 4;
else if (!(uch & RT_BIT(2)))
cb = 5;
else if (!(uch & RT_BIT(1)))
cb = 6;
else
{
RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
}
/* validate the rest */
switch (cb)
{
case 6:
RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
case 5:
RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
case 4:
RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
case 3:
RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
case 2:
RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
break;
}
/* get and validate the code point. */
switch (cb)
{
case 6:
uc = (puch[5] & 0x3f)
| ((RTUNICP)(puch[4] & 0x3f) << 6)
| ((RTUNICP)(puch[3] & 0x3f) << 12)
| ((RTUNICP)(puch[2] & 0x3f) << 18)
| ((RTUNICP)(puch[1] & 0x3f) << 24)
| ((RTUNICP)(uch & 0x01) << 30);
RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
break;
case 5:
uc = (puch[4] & 0x3f)
| ((RTUNICP)(puch[3] & 0x3f) << 6)
| ((RTUNICP)(puch[2] & 0x3f) << 12)
| ((RTUNICP)(puch[1] & 0x3f) << 18)
| ((RTUNICP)(uch & 0x03) << 24);
RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
break;
case 4:
uc = (puch[3] & 0x3f)
| ((RTUNICP)(puch[2] & 0x3f) << 6)
| ((RTUNICP)(puch[1] & 0x3f) << 12)
| ((RTUNICP)(uch & 0x07) << 18);
RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
break;
case 3:
uc = (puch[2] & 0x3f)
| ((RTUNICP)(puch[1] & 0x3f) << 6)
| ((RTUNICP)(uch & 0x0f) << 12);
RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
break;
case 2:
uc = (puch[1] & 0x3f)
| ((RTUNICP)(uch & 0x1f) << 6);
RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
break;
default: /* impossible, but GCC is bitching. */
uc = RTUNICP_INVALID;
break;
}
puch += cb;
}
else
{
/* 6th bit is always set. */
RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
}
*pCp = uc;
*ppsz = (const char *)puch;
return VINF_SUCCESS;
}
RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
/**
* Handle invalid encodings passed to RTStrGetCpNEx().
* @returns rc
* @param ppsz The pointer to the string position point.
* @param pcch Pointer to the string length.
* @param pCp Where to store RTUNICP_INVALID.
* @param rc The iprt error code.
*/
static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
{
/*
* Try find a valid encoding.
*/
(*ppsz)++; /** @todo code this! */
(*pcch)--;
*pCp = RTUNICP_INVALID;
return rc;
}
RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
{
const unsigned char *puch = (const unsigned char *)*ppsz;
const unsigned char uch = *puch;
size_t cch = *pcch;
RTUNICP uc;
if (cch == 0)
{
*pCp = RTUNICP_INVALID;
return VERR_END_OF_STRING;
}
/* ASCII ? */
if (!(uch & RT_BIT(7)))
{
uc = uch;
puch++;
cch--;
}
else if (uch & RT_BIT(6))
{
/* figure the length and validate the first octet. */
/** @todo RT_USE_RTC_3629 */
unsigned cb;
if (!(uch & RT_BIT(5)))
cb = 2;
else if (!(uch & RT_BIT(4)))
cb = 3;
else if (!(uch & RT_BIT(3)))
cb = 4;
else if (!(uch & RT_BIT(2)))
cb = 5;
else if (!(uch & RT_BIT(1)))
cb = 6;
else
{
RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
}
if (cb > cch)
return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
/* validate the rest */
switch (cb)
{
case 6:
RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
case 5:
RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
case 4:
RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
case 3:
RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
case 2:
RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
break;
}
/* get and validate the code point. */
switch (cb)
{
case 6:
uc = (puch[5] & 0x3f)
| ((RTUNICP)(puch[4] & 0x3f) << 6)
| ((RTUNICP)(puch[3] & 0x3f) << 12)
| ((RTUNICP)(puch[2] & 0x3f) << 18)
| ((RTUNICP)(puch[1] & 0x3f) << 24)
| ((RTUNICP)(uch & 0x01) << 30);
RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
break;
case 5:
uc = (puch[4] & 0x3f)
| ((RTUNICP)(puch[3] & 0x3f) << 6)
| ((RTUNICP)(puch[2] & 0x3f) << 12)
| ((RTUNICP)(puch[1] & 0x3f) << 18)
| ((RTUNICP)(uch & 0x03) << 24);
RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
break;
case 4:
uc = (puch[3] & 0x3f)
| ((RTUNICP)(puch[2] & 0x3f) << 6)
| ((RTUNICP)(puch[1] & 0x3f) << 12)
| ((RTUNICP)(uch & 0x07) << 18);
RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
break;
case 3:
uc = (puch[2] & 0x3f)
| ((RTUNICP)(puch[1] & 0x3f) << 6)
| ((RTUNICP)(uch & 0x0f) << 12);
RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
break;
case 2:
uc = (puch[1] & 0x3f)
| ((RTUNICP)(uch & 0x1f) << 6);
RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
break;
default: /* impossible, but GCC is bitching. */
uc = RTUNICP_INVALID;
break;
}
puch += cb;
cch -= cb;
}
else
{
/* 6th bit is always set. */
RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
}
*pCp = uc;
*ppsz = (const char *)puch;
(*pcch) = cch;
return VINF_SUCCESS;
}
RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
{
unsigned char *puch = (unsigned char *)psz;
if (uc < 0x80)
*puch++ = (unsigned char )uc;
else if (uc < 0x00000800)
{
*puch++ = 0xc0 | (uc >> 6);
*puch++ = 0x80 | (uc & 0x3f);
}
else if (uc < 0x00010000)
{
/** @todo RT_USE_RTC_3629 */
if ( uc < 0x0000d8000
|| ( uc > 0x0000dfff
&& uc < 0x0000fffe))
{
*puch++ = 0xe0 | (uc >> 12);
*puch++ = 0x80 | ((uc >> 6) & 0x3f);
*puch++ = 0x80 | (uc & 0x3f);
}
else
{
AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
*puch++ = 0x7f;
}
}
/** @todo RT_USE_RTC_3629 */
else if (uc < 0x00200000)
{
*puch++ = 0xf0 | (uc >> 18);
*puch++ = 0x80 | ((uc >> 12) & 0x3f);
*puch++ = 0x80 | ((uc >> 6) & 0x3f);
*puch++ = 0x80 | (uc & 0x3f);
}
else if (uc < 0x04000000)
{
*puch++ = 0xf8 | (uc >> 24);
*puch++ = 0x80 | ((uc >> 18) & 0x3f);
*puch++ = 0x80 | ((uc >> 12) & 0x3f);
*puch++ = 0x80 | ((uc >> 6) & 0x3f);
*puch++ = 0x80 | (uc & 0x3f);
}
else if (uc <= 0x7fffffff)
{
*puch++ = 0xfc | (uc >> 30);
*puch++ = 0x80 | ((uc >> 24) & 0x3f);
*puch++ = 0x80 | ((uc >> 18) & 0x3f);
*puch++ = 0x80 | ((uc >> 12) & 0x3f);
*puch++ = 0x80 | ((uc >> 6) & 0x3f);
*puch++ = 0x80 | (uc & 0x3f);
}
else
{
AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
*puch++ = 0x7f;
}
return (char *)puch;
}
RT_EXPORT_SYMBOL(RTStrPutCpInternal);
RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
{
if (pszStart < psz)
{
/* simple char? */
const unsigned char *puch = (const unsigned char *)psz;
unsigned uch = *--puch;
if (!(uch & RT_BIT(7)))
return (char *)puch;
RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
/* two or more. */
uint32_t uMask = 0xffffffc0;
while ( (const unsigned char *)pszStart < puch
&& !(uMask & 1))
{
uch = *--puch;
if ((uch & 0xc0) != 0x80)
{
RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
(char *)pszStart);
return (char *)puch;
}
uMask >>= 1;
}
RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
}
return (char *)pszStart;
}
RT_EXPORT_SYMBOL(RTStrPrevCp);