utf8.c revision 4297a3b0d0a35d80f86fff155e288e885a100e6d
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Copyright (c) 2002-2004 Tim J. Robbins
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * All rights reserved.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Redistribution and use in source and binary forms, with or without
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * modification, are permitted provided that the following conditions
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * 1. Redistributions of source code must retain the above copyright
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * notice, this list of conditions and the following disclaimer.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * 2. Redistributions in binary form must reproduce the above copyright
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * notice, this list of conditions and the following disclaimer in the
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * documentation and/or other materials provided with the distribution.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * SUCH DAMAGE.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Use is subject to license terms.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amorestatic size_t _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amorestatic size_t _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore const char **_RESTRICT_KYWD, size_t, size_t,
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amorestatic size_t _UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amorestatic size_t _UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amoretypedef struct {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * In theory up to 6 bytes can be used for the encoding,
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * but encodings of more than 4 bytes are treated as illegal.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Note that the other CSWIDTH members are nonsensical for this
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * coding. They are only valid with EUC codings.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore_UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* Incomplete multibyte sequence */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* Fast path for plain ASCII characters. */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Determine the number of octets that make up this character
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * from the first octet, and a mask that extracts the
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * interesting bits of the first octet. We already know
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * the character is at least two bytes long.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * We also specify a lower bound for the character code to
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * detect redundant, non-"shortest form" encodings. For
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * example, the sequence C0 80 is _not_ a legal representation
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * of the null character. This enforces a 1-to-1 mapping
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * between character codes and their multibyte representations.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore ch = (unsigned char)*s;
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* These would be illegal in the UTF-8 space */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Malformed input; input is not UTF-8.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Decode the octet sequence representing the character in chunks
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * of 6 bits, most significant first.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Malformed input; bad characters in the middle
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * of a character.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* Incomplete multibyte sequence. */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Malformed input; redundant encoding.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore_UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore const char *s;
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * The fast path in the loop below is not safe if an ASCII
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * character appears as anything but the first byte of a
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * multibyte sequence. Check now to avoid doing it in the loop.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore if (nms > 0 && (signed char)*s > 0)
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Fast path for plain ASCII characters
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * excluding NUL.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* Invalid sequence - mbrtowc() sets errno. */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /*NOTREACHED*/
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * The fast path in the loop below is not safe if an ASCII
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * character appears as anything but the first byte of a
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * multibyte sequence. Check now to avoid doing it in the loop.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore while (len-- > 0) {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore if (nms > 0 && (signed char)*s > 0) {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Fast path for plain ASCII characters
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * excluding NUL.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore } else if (nb == 0) {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore_UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore unsigned char lead;
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* Reset to initial shift state (no-op) */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* Fast path for plain ASCII characters. */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Determine the number of octets needed to represent this character.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * We always output the shortest sequence possible. Also specify the
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * first few bits of the first octet, which contains the information
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * about the sequence length.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* Again, 5 and 6 byte encodings are simply not permitted */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * Output the octets representing the character in chunks
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * of 6 bits, least significant last. The first octet is
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * a special case because it contains the sequence length
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * information.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore_UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore while (nwc-- > 0) {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore if (0 <= *s && *s < 0x80)
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* Fast path for plain ASCII characters. */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* Invalid character - wcrtomb() sets errno. */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore if (*s == L'\0')
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore if (0 <= *s && *s < 0x80) {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* Fast path for plain ASCII characters. */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* Enough space to translate in-place. */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore * May not be enough space; use temp. buffer.
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore /* MB sequence for character won't fit. */
4297a3b0d0a35d80f86fff155e288e885a100e6dGarrett D'Amore if (*s == L'\0') {