/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
* (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
* Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
* uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
* the section 3C man pages.
* Interface stability: Committed
*/
#ifdef _KERNEL
#include <sys/sysmacros.h>
#else
#include <sys/u8_textprep.h>
#endif /* _KERNEL */
#include <sys/byteorder.h>
/*
* The max and min values of high and low surrogate pairs of UTF-16,
* UTF-16 bit shift value, bit mask, and starting value outside of BMP.
*/
/* The maximum value of Unicode coding space and ASCII coding space. */
/* The mask values for input and output endians. */
/* Native and reversed endian macros. */
#ifdef _BIG_ENDIAN
#else
#endif /* _BIG_ENDIAN */
/* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
/* UTF-32 boundaries based on UTF-8 character byte lengths. */
/* The common minimum and maximum values at the UTF-8 character bytes. */
/*
* The following "6" and "0x3f" came from "10xx xxxx" bit representation of
* UTF-8 character bytes.
*/
/*
* The following vector shows remaining bytes in a UTF-8 character.
* Index will be the first byte of the character.
*/
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
* The following is a vector of bit-masks to get used bits in
* the first byte of a UTF-8 character. The index is the number of remaining
* bytes of the character, as given by the vector above.
*/
#ifdef _KERNEL
#else
#endif /* _KERNEL */
/*
* The following two vectors provide the valid minimum and
* maximum values for the second byte of a multibyte UTF-8 character for
* better illegal sequence checking. The index value must be the value of
* the first byte of the UTF-8 character.
*/
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* C0 C1 C2 C3 C4 C5 C6 C7 */
0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/* C8 C9 CA CB CC CD CE CF */
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/* D0 D1 D2 D3 D4 D5 D6 D7 */
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/* D8 D9 DA DB DC DD DE DF */
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/* E0 E1 E2 E3 E4 E5 E6 E7 */
0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/* E8 E9 EA EB EC ED EE EF */
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/* F0 F1 F2 F3 F4 F5 F6 F7 */
0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0
};
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* C0 C1 C2 C3 C4 C5 C6 C7 */
0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/* C8 C9 CA CB CC CD CE CF */
0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/* D0 D1 D2 D3 D4 D5 D6 D7 */
0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/* D8 D9 DA DB DC DD DE DF */
0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/* E0 E1 E2 E3 E4 E5 E6 E7 */
0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/* E8 E9 EA EB EC ED EE EF */
0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
/* F0 F1 F2 F3 F4 F5 F6 F7 */
0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0
};
/*
* Validate the endian selections held in *in and *out.
*
* Returns EBADF when *in equals UCONV_IN_ENDIAN_MASKS or when *out equals
* UCONV_OUT_ENDIAN_MASKS (both endian choices of one direction requested at
* once), and 0 on the visible success path.
*
* NOTE(review): the declarator line (function name and parameters) and the
* statements that belong to the two "== 0" tests are not visible in this
* view; presumably those tests select a default endian -- confirm against
* the full file before relying on this description.
*/
static int
{
/* You cannot have both input endian flags set. */
if (*in == UCONV_IN_ENDIAN_MASKS)
return (EBADF);
/* No input endian requested at all. */
if (*in == 0)
/* You cannot have both output endian flags set. */
if (*out == UCONV_OUT_ENDIAN_MASKS)
return (EBADF);
/* No output endian requested at all. */
if (*out == 0)
return (0);
}
/*
* Report whether the first 16-bit unit of the input is a Byte Order Mark.
*
* Returns B_TRUE when u16l is greater than zero and *u16s equals either
* UCONV_BOM_NORMAL or UCONV_BOM_SWAPPED; returns B_FALSE otherwise,
* including for an empty buffer.
*
* NOTE(review): the declarator line (name and parameters) is not visible in
* this view, and each matching branch presumably also recorded the detected
* endian before returning -- confirm against the full file.
*/
static boolean_t
{
/* Only look at the first unit when there is at least one. */
if (u16l > 0) {
/* BOM in the normal byte ordering. */
if (*u16s == UCONV_BOM_NORMAL) {
return (B_TRUE);
}
/* BOM in the reversed byte ordering. */
if (*u16s == UCONV_BOM_SWAPPED) {
return (B_TRUE);
}
}
return (B_FALSE);
}
/*
* Report whether the first 32-bit unit of the input is a Byte Order Mark.
*
* Returns B_TRUE when u32l is greater than zero and *u32s equals either
* UCONV_BOM_NORMAL or UCONV_BOM_SWAPPED_32; returns B_FALSE otherwise,
* including for an empty buffer.
*
* NOTE(review): the declarator line (name and parameters) is not visible in
* this view, and each matching branch presumably also recorded the detected
* endian before returning -- confirm against the full file.
*/
static boolean_t
{
/* Only look at the first unit when there is at least one. */
if (u32l > 0) {
/* BOM in the normal byte ordering. */
if (*u32s == UCONV_BOM_NORMAL) {
return (B_TRUE);
}
/* BOM in the reversed byte ordering, 32-bit form. */
if (*u32s == UCONV_BOM_SWAPPED_32) {
return (B_TRUE);
}
}
return (B_FALSE);
}
/*
* Convert UTF-16 input into UTF-32 output, assembling surrogate pairs into
* single UTF-32 characters.
*
* Visible return values: 0 on success; EILSEQ for illegal input such as a
* surrogate half that is not properly paired; EINVAL when the input ends
* right after a high surrogate; E2BIG and EBADF from checks whose
* conditions are not visible here.
*
* NOTE(review): the declarator line is not visible in this view (presumably
* this is uconv_u16tou32(), the first routine named in the file header),
* and a number of statements, including the loop header and the guards of
* several returns, are missing -- confirm against the full file.
*/
int
{
int inendian;
int outendian;
/*
* Do preliminary validity checks on parameters and collect info on
* endians. The conditions guarding the three returns below are not
* visible in this view.
*/
return (EILSEQ);
return (E2BIG);
return (EBADF);
/*
* Initialize input and output parameter buffer indices and
* temporary variables.
*/
hi = 0;
/*
* Check on the BOM at the beginning of the input buffer if required
* and if there is indeed one, process it.
*/
if ((flag & UCONV_IN_ACCEPT_BOM) &&
u16l++;
/*
* Reset inendian and outendian so that after this point, those can be
* used as condition values.
*/
/*
* If there is something in the input buffer and if necessary and
* requested, save the BOM at the output buffer.
*/
/*
* Do the conversion; if a surrogate pair is encountered, assemble the
* high and low pair values to form a UTF-32 character. If one half of
* a pair exists alone, then it is either an illegal (EILSEQ) or
* an invalid (EINVAL) value.
*/
break;
/* A pending high half (hi != 0) at this point is illegal. */
if (hi)
return (EILSEQ);
continue;
/* Reaching this point without a pending high half is illegal. */
if (! hi)
return (EILSEQ);
/* The pending high half has been consumed. */
hi = 0;
} else if (hi) {
return (EILSEQ);
}
return (E2BIG);
}
/*
* If the high half didn't see a low half, then most likely the input
* parameter is incomplete.
*/
if (hi)
return (EINVAL);
/*
* Save the number of consumed and saved characters. They do not
* include the terminating NULL character (U+0000) at the end of
* the input buffer (even when UCONV_IGNORE_NULL isn't specified and
* the input buffer length is big enough to include the terminating
* NULL character).
*/
return (0);
}
/*
* Convert UTF-16 input into UTF-8 output. Each input character is first
* resolved to a UTF-32 value and then written in UTF-8 form whose length
* depends on which UCONV_U8_*_BYTE(S) boundary the value falls under.
*
* Visible return values: 0 on success; EILSEQ for illegal input, including
* a value above UCONV_U8_FOUR_BYTES; EINVAL when a high surrogate is still
* pending once the loop has finished; E2BIG from each output-size branch
* and EBADF from a check whose condition is not visible here.
*
* NOTE(review): the declarator line is not visible in this view (presumably
* this is uconv_u16tou8(), per the routine list in the file header), and
* many statements, including the loop header and the output stores, are
* missing -- confirm against the full file.
*/
int
{
int inendian;
int outendian;
/* Parameter checks; the guarding conditions are not visible here. */
return (EILSEQ);
return (E2BIG);
return (EBADF);
/* No high surrogate is pending at the start. */
hi = 0;
if ((flag & UCONV_IN_ACCEPT_BOM) &&
u16l++;
break;
/* A pending high half (hi != 0) at this point is illegal. */
if (hi)
return (EILSEQ);
continue;
/* Reaching this point without a pending high half is illegal. */
if (! hi)
return (EILSEQ);
/* The pending high half has been consumed. */
hi = 0;
} else if (hi) {
return (EILSEQ);
}
/*
* Now we convert a UTF-32 character into a UTF-8 character.
* Unicode coding space is between U+0000 and U+10FFFF;
* anything bigger is an illegal character.
*/
if (lo <= UCONV_U8_ONE_BYTE) {
return (E2BIG);
} else if (lo <= UCONV_U8_TWO_BYTES) {
return (E2BIG);
} else if (lo <= UCONV_U8_THREE_BYTES) {
return (E2BIG);
} else if (lo <= UCONV_U8_FOUR_BYTES) {
return (E2BIG);
} else {
/* Beyond the four-byte boundary: not a legal character. */
return (EILSEQ);
}
}
/* A high half still pending here is reported as EINVAL. */
if (hi)
return (EINVAL);
return (0);
}
/*
* Convert UTF-32 input into UTF-16 output. Characters at or above
* UCONV_U16_START are emitted as a surrogate pair; smaller values take
* the other branch.
*
* Visible return values: 0 on success; EILSEQ for illegal input, including
* a value above UCONV_UNICODE_MAX; E2BIG from both output branches and
* EBADF from a check whose condition is not visible here.
*
* NOTE(review): the declarator line is not visible in this view (presumably
* this is uconv_u32tou16(), per the routine list in the file header), and
* many statements, including the loop header and the output stores, are
* missing -- confirm against the full file.
*/
int
{
int inendian;
int outendian;
/* Parameter checks; the guarding conditions are not visible here. */
return (EILSEQ);
return (E2BIG);
return (EBADF);
if ((flag & UCONV_IN_ACCEPT_BOM) &&
u32l++;
break;
/*
* Anything bigger than the Unicode coding space, i.e., a
* Unicode scalar value bigger than U+10FFFF, is an illegal
* character.
*/
if (hi > UCONV_UNICODE_MAX)
return (EILSEQ);
/*
* Anything bigger than U+FFFF must be converted into
* a surrogate pair in UTF-16.
*/
if (hi >= UCONV_U16_START) {
return (E2BIG);
/*
* NOTE(review): the bodies of both outendian branches are
* not visible in this view.
*/
if (outendian) {
} else {
}
} else {
return (E2BIG);
}
}
return (0);
}
/*
* Convert UTF-32 input into UTF-8 output. The UTF-8 length of each
* character is selected by comparing its value against the
* UCONV_U8_ONE_BYTE .. UCONV_U8_FOUR_BYTES boundaries.
*
* Visible return values: 0 on success; EILSEQ for illegal input, including
* a value above UCONV_U8_FOUR_BYTES; E2BIG from each output-size branch
* and EBADF from a check whose condition is not visible here.
*
* NOTE(review): the declarator line is not visible in this view (presumably
* this is uconv_u32tou8(), per the routine list in the file header), and
* many statements, including the loop header and the output stores, are
* missing -- confirm against the full file.
*/
int
{
int inendian;
int outendian;
/* Parameter checks; the guarding conditions are not visible here. */
return (EILSEQ);
return (E2BIG);
return (EBADF);
if ((flag & UCONV_IN_ACCEPT_BOM) &&
u32l++;
break;
/* Pick the UTF-8 encoded length from the value of the character. */
if (lo <= UCONV_U8_ONE_BYTE) {
return (E2BIG);
} else if (lo <= UCONV_U8_TWO_BYTES) {
return (E2BIG);
} else if (lo <= UCONV_U8_THREE_BYTES) {
return (E2BIG);
} else if (lo <= UCONV_U8_FOUR_BYTES) {
return (E2BIG);
} else {
/* Beyond the four-byte boundary: not a legal character. */
return (EILSEQ);
}
}
return (0);
}
/*
* Convert UTF-8 input into UTF-16 output. Each UTF-8 character is first
* collected into a UTF-32 value; values at or above UCONV_U16_START take
* the first output branch (presumably a surrogate pair) and smaller values
* take the other.
*
* Visible return values: 0 on success; EILSEQ for an illegally formed
* UTF-8 character; EINVAL for an incomplete UTF-8 character at the end of
* the input; E2BIG from both output branches and EBADF from a check whose
* condition is not visible here.
*
* NOTE(review): the declarator line is not visible in this view (presumably
* this is uconv_u8tou16(), per the routine list in the file header), and
* many statements, including the loop header, the guards of several
* returns and the output stores, are missing -- confirm against the full
* file.
*/
int
{
int inendian;
int outendian;
int remaining_bytes;
int first_b;
/* Parameter checks; the guarding conditions are not visible here. */
return (EILSEQ);
return (E2BIG);
return (EBADF);
break;
/*
* Collect a UTF-8 character and convert it to a UTF-32
* character. In doing so, we screen out illegally formed
* UTF-8 characters and treat such as illegal characters.
* The algorithm below also screens out anything bigger
* than U+10FFFF.
*
* See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
* more details on the illegal values of UTF-8 character
* bytes.
*/
if (hi > UCONV_ASCII_MAX) {
return (EILSEQ);
/* Consume the bytes that follow the first byte, one at a time. */
for (; remaining_bytes > 0; remaining_bytes--) {
/*
* If we have no more bytes, the current
* UTF-8 character is incomplete.
*/
return (EINVAL);
/*
* first_b is non-zero only for the byte right after the
* first byte; it is cleared below so that later bytes go
* through the generic UCONV_U8_BYTE_MIN/MAX range check.
* NOTE(review): the condition of the check made while
* first_b is set is not visible in this view.
*/
if (first_b) {
return (EILSEQ);
first_b = 0;
} else if (lo < UCONV_U8_BYTE_MIN ||
lo > UCONV_U8_BYTE_MAX) {
return (EILSEQ);
}
/*
* NOTE(review): the line below is the tail of a statement
* whose beginning is not visible; presumably it merges
* the masked bits of this byte into hi -- confirm.
*/
(lo & UCONV_U8_BIT_MASK);
}
}
/* Choose the output form from the value of the character. */
if (hi >= UCONV_U16_START) {
return (E2BIG);
/*
* NOTE(review): the bodies of both outendian branches are
* not visible in this view.
*/
if (outendian) {
} else {
}
} else {
return (E2BIG);
}
}
return (0);
}
/*
* Convert UTF-8 input into UTF-32 output, rejecting illegally formed UTF-8
* characters along the way.
*
* Visible return values: 0 on success; EILSEQ for an illegally formed
* UTF-8 character, e.g. a second byte outside the range given by
* valid_min_2nd_byte[] and valid_max_2nd_byte[] or a later byte outside
* UCONV_U8_BYTE_MIN..UCONV_U8_BYTE_MAX; EINVAL from inside the
* byte-collecting loop (presumably for a character cut short by the end of
* the input); E2BIG and EBADF from checks whose conditions are not visible
* here.
*
* NOTE(review): the declarator line is not visible in this view (presumably
* this is uconv_u8tou32(), per the routine list in the file header), and
* many statements, including the loop header, the guards of several
* returns and the output stores, are missing -- confirm against the full
* file.
*/
int
{
int inendian;
int outendian;
uint32_t c;
int remaining_bytes;
int first_b;
/* Parameter checks; the guarding conditions are not visible here. */
return (EILSEQ);
return (E2BIG);
return (EBADF);
break;
/* Values above UCONV_ASCII_MAX start a multibyte character. */
if (hi > UCONV_ASCII_MAX) {
return (EILSEQ);
/* Consume the bytes that follow the first byte, one at a time. */
for (; remaining_bytes > 0; remaining_bytes--) {
return (EINVAL);
/*
* The second byte is checked against the range indexed by
* first_b; first_b is then cleared so that later bytes go
* through the generic UCONV_U8_BYTE_MIN/MAX range check.
*/
if (first_b) {
if (c < valid_min_2nd_byte[first_b] ||
c > valid_max_2nd_byte[first_b])
return (EILSEQ);
first_b = 0;
} else if (c < UCONV_U8_BYTE_MIN ||
c > UCONV_U8_BYTE_MAX) {
return (EILSEQ);
}
/*
* NOTE(review): the line below is the tail of a statement
* whose beginning is not visible; presumably it merges
* the masked bits of this byte into hi -- confirm.
*/
(c & UCONV_U8_BIT_MASK);
}
}
return (E2BIG);
}
return (0);
}