/* utf8.c revision af062818b47340eef15700d2f0211576ba3506ee */
/*
* UTF-8 support routines
*
* Copyright 2000 Alexandre Julliard
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
*/
#include <string.h>
/*
 * Number of bytes that follow a given first byte in a UTF-8 sequence.
 * Only first bytes above 0x7f are covered: the entry for byte value b
 * lives at index (b - 0x80).  Each row below holds eight entries.
 */
static const char utf8_length[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80-0x87 */
    0, 0, 0, 0, 0, 0, 0, 0,   /* 0x88-0x8f */
    0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90-0x97 */
    0, 0, 0, 0, 0, 0, 0, 0,   /* 0x98-0x9f */
    0, 0, 0, 0, 0, 0, 0, 0,   /* 0xa0-0xa7 */
    0, 0, 0, 0, 0, 0, 0, 0,   /* 0xa8-0xaf */
    0, 0, 0, 0, 0, 0, 0, 0,   /* 0xb0-0xb7 */
    0, 0, 0, 0, 0, 0, 0, 0,   /* 0xb8-0xbf */
    0, 0, 1, 1, 1, 1, 1, 1,   /* 0xc0-0xc7 (0xc0 and 0xc1 map to 0) */
    1, 1, 1, 1, 1, 1, 1, 1,   /* 0xc8-0xcf */
    1, 1, 1, 1, 1, 1, 1, 1,   /* 0xd0-0xd7 */
    1, 1, 1, 1, 1, 1, 1, 1,   /* 0xd8-0xdf */
    2, 2, 2, 2, 2, 2, 2, 2,   /* 0xe0-0xe7 */
    2, 2, 2, 2, 2, 2, 2, 2,   /* 0xe8-0xef */
    3, 3, 3, 3, 3, 0, 0, 0,   /* 0xf0-0xf7 (0xf5 and above map to 0) */
    0, 0, 0, 0, 0, 0, 0, 0    /* 0xf8-0xff */
};
/* first byte mask depending on UTF-8 sequence length */
/* minimum Unicode value depending on UTF-8 sequence length */
/* get the next char value taking surrogates into account */
{
{
return 0;
}
return src[0];
}
/* query necessary dst length for src string */
{
int len;
unsigned int val;
{
{
len++;
continue;
}
{
len += 2;
continue;
}
{
continue;
}
len += 3;
else /* 0x10000-0x10ffff: 4 bytes */
{
len += 4;
src++;
srclen--;
}
}
return len;
}
/* wide char to UTF-8 string conversion */
/* return -1 on dst buffer overflow, -2 on invalid input char */
{
int len;
{
unsigned int val;
{
continue;
}
{
ch >>= 6;
dst += 2;
continue;
}
{
continue;
}
{
val >>= 6;
val >>= 6;
dst += 3;
}
else /* 0x10000-0x10ffff: 4 bytes */
{
val >>= 6;
val >>= 6;
val >>= 6;
dst += 4;
src++;
srclen--;
}
}
}
/*
 * helper for the various utf8 mbstowcs functions
 *
 * Switches on len: cases 3, 2 and 1 fall through into one another and each
 * advances *str by one byte, so *str moves forward by len bytes in total
 * before res is returned.  For any other value of len, *str is left
 * untouched and ~0 (all bits set) is returned.
 *
 * NOTE(review): len and res are not declared anywhere in this body, and the
 * ch and strend parameters are never referenced here.  The statements that
 * set up len and res appear to be missing -- confirm against the complete
 * source before relying on this description.
 */
static inline unsigned int decode_utf8_char( unsigned char ch, const char **str, const char *strend )
{
switch(len)
{
case 3:
(*str)++; /* fall through */
case 2:
(*str)++; /* fall through */
case 1:
(*str)++;
return res;
}
/* len was not 1, 2 or 3: nothing consumed, return the all-ones value */
return ~0;
}
/* query necessary dst length for src string with composition */
{
int ret = 0;
unsigned int res;
composed[0] = 0;
{
{
ret++;
continue;
}
{
if (composed[0])
{
}
ret++;
}
else if (res <= 0x10ffff)
{
ret += 2;
composed[0] = 0; /* no composition for surrogates */
}
/* otherwise ignore it */
}
return ret;
}
/* UTF-8 to wide char string conversion with composition */
/* return -1 on dst buffer overflow, -2 on invalid input char */
{
unsigned int res;
composed[0] = 0;
{
{
continue;
}
{
if (composed[0])
{
{
continue;
}
}
}
{
res -= 0x10000;
composed[0] = 0; /* no composition for surrogates */
}
/* otherwise ignore it */
}
}
/* query necessary dst length for src string */
{
int ret = 0;
unsigned int res;
{
{
ret++;
continue;
}
{
ret++;
}
/* otherwise ignore it */
}
return ret;
}
/* UTF-8 to wide char string conversion */
/* return -1 on dst buffer overflow, -2 on invalid input char */
{
unsigned int res;
{
{
continue;
}
{
}
{
res -= 0x10000;
}
/* otherwise ignore it */
}
}