mbrtowc.c revision 2
/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2010 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

/* Implement mbrtowc() on top of mbtowc().  */
if (n >=
2 && m <
4)
2N/A if (n >=
3 && m <
4)
/* mbtowc does not distinguish between invalid and incomplete multibyte
   sequences.  But mbrtowc needs to make this distinction.
   There are two possible approaches:
     - Use iconv() and its return value.
     - Use built-in knowledge about the possible encodings.
   Given the low quality of implementation of iconv() on the systems that
   lack mbrtowc(), we use the second approach.
   The possible encodings are:
     - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
     - UTF-8.
   Use specialized code for each.  */

/* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
if (
STREQ (
encoding,
"UTF-8",
'U',
'T',
'F',
'-',
'8', 0, 0, 0, 0))
2N/A unsigned char c = (
unsigned char) p[0];
2N/A unsigned char c2 = (
unsigned char) p[
1];
2N/A && (c >=
0xe1 ||
c2 >=
0xa0)
2N/A && (c !=
0xed ||
c2 <
0xa0))
2N/A else /* m == 2 || m == 3 */ 2N/A unsigned char c2 = (
unsigned char) p[
1];
2N/A && (c >=
0xf1 ||
c2 >=
0x90)
2N/A && (c <
0xf4 || (c ==
0xf4 &&
c2 <
0x90)))
2N/A unsigned char c3 = (
unsigned char) p[
2];
/* As a reference for this code, you can use the GNU libiconv
   implementation.  Look for uses of the RET_TOOFEW macro.  */
if (
STREQ (
encoding,
"EUC-JP",
'E',
'U',
'C',
'-',
'J',
'P', 0, 0, 0))
2N/A unsigned char c = (
unsigned char) p[0];
2N/A if ((c >=
0xa1 && c <
0xff) || c ==
0x8e || c ==
0x8f)
2N/A unsigned char c = (
unsigned char) p[0];
2N/A unsigned char c2 = (
unsigned char) p[
1];
2N/A if (
STREQ (
encoding,
"EUC-KR",
'E',
'U',
'C',
'-',
'K',
'R', 0, 0, 0)
2N/A ||
STREQ (
encoding,
"GB2312",
'G',
'B',
'2',
'3',
'1',
'2', 0, 0, 0)
2N/A ||
STREQ (
encoding,
"BIG5",
'B',
'I',
'G',
'5', 0, 0, 0, 0, 0))
2N/A unsigned char c = (
unsigned char) p[0];
2N/A if (c >=
0xa1 && c <
0xff)
2N/A if (
STREQ (
encoding,
"EUC-TW",
'E',
'U',
'C',
'-',
'T',
'W', 0, 0, 0))
2N/A unsigned char c = (
unsigned char) p[0];
2N/A if ((c >=
0xa1 && c <
0xff) || c ==
0x8e)
2N/A else /* m == 2 || m == 3 */ 2N/A unsigned char c = (
unsigned char) p[0];
2N/A if (
STREQ (
encoding,
"GB18030",
'G',
'B',
'1',
'8',
'0',
'3',
'0', 0, 0))
2N/A unsigned char c = (
unsigned char) p[0];
2N/A if ((c >=
0x90 && c <=
0xe3) || (c >=
0xf8 && c <=
0xfe))
2N/A else /* m == 2 || m == 3 */ 2N/A unsigned char c = (
unsigned char) p[0];
2N/A if (c >=
0x90 && c <=
0xe3)
2N/A unsigned char c2 = (
unsigned char) p[
1];
2N/A unsigned char c3 = (
unsigned char) p[
2];
2N/A if (
STREQ (
encoding,
"SJIS",
'S',
'J',
'I',
'S', 0, 0, 0, 0, 0))
2N/A unsigned char c = (
unsigned char) p[0];
2N/A if ((c >=
0x81 && c <=
0x9f) || (c >=
0xe0 && c <=
0xea)
2N/A || (c >=
0xf0 && c <=
0xf9))
/* An unknown multibyte encoding.  */
/* Here 0 <= k < m < 4.  */
/* The conversion state is undefined, says POSIX.  */
/* Override the system's mbrtowc() function.  */
/* Override mbrtowc's internal state.  We cannot call mbsinit() on the
   hidden internal state, but we can call it on our variable.  */
/* Parse the rest of the multibyte character byte for byte.  */
for (; n > 0; s++, n--)
/* The multibyte character has been completed.  */