UTF8CodingSystem.cxx revision 7c478bd95313f5f23a4c958a745db2134aa03244
// Copyright (c) 1994 James Clark
// See the file COPYING for copying permission.
#pragma ident "%Z%%M% %I% %E% SMI"
#include "splib.h"
#ifdef SP_MULTI_BYTE
#include "UTF8CodingSystem.h"
#include "constant.h"
#ifdef SP_NAMESPACE
namespace SP_NAMESPACE {
#endif
// Bit-layout constants for UTF-8 sequences of 1 to 6 bytes.
// For a sequence of N bytes:
//   cmaskN  mask applied to the first byte to test for an N byte sequence
//   cvalN   value the masked first byte has in an N byte sequence
//   vmaskN  mask that extracts the value bits from the first byte
//   minN    minimum legal resulting value for an N byte sequence
enum {
  // 1 byte: first byte is 0xxxxxxx
  cmask1 = 0x80,
  cval1 = 0x00,

  // 2 bytes: first byte is 110xxxxx
  cmask2 = 0xe0,
  cval2 = 0xc0,
  vmask2 = 0x1f,
  min2 = 0x80,

  // 3 bytes: first byte is 1110xxxx
  cmask3 = 0xf0,
  cval3 = 0xe0,
  vmask3 = 0xf,
  min3 = 0x800,

  // 4 bytes: first byte is 11110xxx
  cmask4 = 0xf8,
  cval4 = 0xf0,
  vmask4 = 0x7,
  min4 = 0x10000,

  // 5 bytes: first byte is 111110xx
  cmask5 = 0xfc,
  cval5 = 0xf8,
  vmask5 = 0x3,
  min5 = 0x200000,

  // 6 bytes: first byte is 1111110x
  cmask6 = 0xfe,
  cval6 = 0xfc,
  vmask6 = 0x1,
  min6 = 0x4000000,

  // largest value a 6 byte sequence can carry (31 bits)
  max6 = 0x7fffffff
};
// Decoder that converts a stream of UTF-8 bytes into Chars.  Input may be
// supplied in several pieces; see decode().
class UTF8Decoder : public Decoder {
public:
  UTF8Decoder();
  // Decodes up to slen bytes starting at s into the buffer at to, stores
  // the address of the first unconsumed byte in *result and returns the
  // number of Chars written.
  size_t decode(Char *to, const char *s, size_t slen, const char **result);
private:
  enum {
    // Char stored in place of input that cannot be decoded.
    invalid = 0xfffd
  };
  // Set when decode() ran out of input while still skipping the
  // continuation bytes that follow a malformed sequence; the next call
  // resumes the skipping before decoding anything else.
  Boolean recovering_;
};
class UTF8Encoder : public Encoder {
public:
UTF8Encoder();
void output(const Char *, size_t, OutputByteStream *);
void output(Char *tmp_char, size_t tmp_size_t, OutputByteStream *tmp_obs) {
output((const Char *)tmp_char, (size_t) tmp_size_t, (OutputByteStream *)tmp_obs);
}
};
// Returns a newly allocated (with new) UTF-8 decoder.
Decoder *UTF8CodingSystem::makeDecoder() const
{
  Decoder *decoder = new UTF8Decoder;
  return decoder;
}
// Returns a newly allocated (with new) UTF-8 encoder.
Encoder *UTF8CodingSystem::makeEncoder() const
{
  Encoder *encoder = new UTF8Encoder;
  return encoder;
}
// A new decoder starts out not recovering from any malformed sequence.
UTF8Decoder::UTF8Decoder() : recovering_(0) {}
// Checks the continuation bytes of one multi-byte sequence and folds their
// value bits into an accumulator.
//
// lead   points at the first byte of the sequence; lead[1] .. lead[len - 1]
//        must be readable.
// len    total number of bytes in the sequence, including the first byte.
// value  on entry holds the value bits taken from the first byte; on
//        success holds the completely assembled value.
//
// Returns 1 if every continuation byte has the form 10xxxxxx, and 0
// otherwise (value is then only partially updated and must not be used).
static
int utf8FoldContinuations(const unsigned char *lead, unsigned len,
                          unsigned long &value)
{
  for (unsigned i = 1; i < len; i++) {
    // Flipping the top bit maps a valid continuation byte (10xxxxxx) onto
    // its 6 value bits (00xxxxxx); anything else keeps a bit in 0xc0 set.
    unsigned bits = lead[i] ^ 0x80;
    if (bits & 0xc0)
      return 0;
    value = (value << 6) | bits;
  }
  return 1;
}

// Decodes UTF-8 bytes into Chars.
//
// to      receives the decoded Chars.  Every Char stored consumes at least
//         one input byte, so at most slen Chars are written.
// s       start of the input bytes.
// slen    number of input bytes available at s.
// result  *result is set to the first byte that was not consumed.  A
//         multi-byte sequence whose tail has not arrived yet is left
//         unconsumed so that it can be presented again with more data.
//
// Returns the number of Chars stored at to.
//
// Error handling: a byte that cannot start a sequence, or a sequence
// containing a byte that is not a continuation byte, produces one
// `invalid' Char; the offending first byte and all continuation bytes
// directly after it are discarded.  If the input runs out while
// discarding, recovering_ is set so that the next call carries on
// discarding.  A well-formed sequence that is longer than necessary for its
// value, or whose length class exceeds what Char can hold, is consumed
// whole and also produces `invalid'.
size_t UTF8Decoder::decode(Char *to, const char *s,
                           size_t slen, const char **result)
{
  Char *start = to;
  const unsigned char *us = (const unsigned char *)s;
  // Nonzero while the continuation bytes after a malformed sequence are
  // being discarded; seeded from the state left by the previous call.
  int skipping = recovering_ ? 1 : 0;
  recovering_ = 0;

  for (;;) {
    if (skipping) {
      while (slen > 0 && (*us & 0xc0) == 0x80) {
        us++;
        slen--;
      }
      if (slen == 0) {
        // Out of input before seeing a byte that ends the discarding:
        // remember to continue in the next call.
        recovering_ = 1;
        break;
      }
      skipping = 0;
    }
    if (slen == 0)
      break;

    unsigned c0 = us[0];
    if ((c0 & cmask1) == cval1) {
      // single byte, stored as is
      *to++ = c0;
      us++;
      slen--;
      continue;
    }

    // Classify the first byte.
    unsigned len;            // bytes in the sequence; 0 if c0 cannot start one
    unsigned long c;         // value bits contributed by c0
    unsigned long minValue;  // smallest value allowed for this length
    int fits;                // nonzero if Char can hold values of this length
    if ((c0 & cmask2) == cval2) {
      len = 2;
      c = c0 & vmask2;
      minValue = min2;
      fits = 1;
    }
    else if ((c0 & cmask3) == cval3) {
      len = 3;
      c = c0 & vmask3;
      minValue = min3;
      fits = 1;
    }
    else if ((c0 & cmask4) == cval4) {
      len = 4;
      c = c0 & vmask4;
      minValue = min4;
      fits = (charMax >= min5 - 1);
    }
    else if ((c0 & cmask5) == cval5) {
      len = 5;
      c = c0 & vmask5;
      minValue = min5;
      fits = (charMax >= min6 - 1);
    }
    else if ((c0 & cmask6) == cval6) {
      len = 6;
      c = c0 & vmask6;
      minValue = min6;
      fits = (charMax >= max6);
    }
    else {
      len = 0;
      c = 0;
      minValue = 0;
      fits = 0;
    }

    if (len != 0) {
      if (slen < len)
        break;  // incomplete sequence: leave it for the next call
      if (utf8FoldContinuations(us, len, c)) {
        if (!fits || c < minValue)
          c = invalid;
        *to++ = c;
        us += len;
        slen -= len;
        continue;
      }
    }

    // Malformed input: drop the first byte, emit a single `invalid' and
    // discard the continuation bytes that follow.
    us++;
    slen--;
    *to++ = invalid;
    skipping = 1;
  }

  *result = (const char *)us;
  return to - start;
}
// Nothing to initialize.
UTF8Encoder::UTF8Encoder() {}
// Writes the n Chars starting at s to sb as UTF-8.
//
// Values below min2 are written as a single byte.  Larger values are
// written as a first byte, carrying the length marker and the topmost
// value bits, followed by one to five continuation bytes of the form
// 10xxxxxx with 6 value bits each, most significant first.  A value
// greater than max6 produces no output at all.
void UTF8Encoder::output(const Char *s, size_t n, OutputByteStream *sb)
{
  while (n > 0) {
    Char c = *s;
    s++;
    n--;

    if (c < min2) {
      sb->sputc((unsigned char)c);
      continue;
    }

    int lead;        // length marker for the first byte
    unsigned trail;  // number of continuation bytes
    if (c < min3) {
      lead = cval2;
      trail = 1;
    }
    else if (c < min4) {
      lead = cval3;
      trail = 2;
    }
    else if (c < min5) {
      lead = cval4;
      trail = 3;
    }
    else if (c < min6) {
      lead = cval5;
      trail = 4;
    }
    else if (c <= max6) {
      lead = cval6;
      trail = 5;
    }
    else
      continue;  // not representable: skipped without output

    // The first byte holds whatever is left above the continuation bits.
    sb->sputc((c >> (6 * trail)) | lead);
    while (trail > 0) {
      trail--;
      sb->sputc(((c >> (6 * trail)) & 0x3f) | 0x80);
    }
  }
}
#ifdef SP_NAMESPACE
}
#endif
#else /* not SP_MULTI_BYTE */
#ifndef __GNUG__
// Dummy definition so that this file still defines something when
// SP_MULTI_BYTE is not defined and everything above is compiled out.
// NOTE(review): presumably some non-GNU compilers reject or warn about an
// empty translation unit -- confirm before removing.
static char non_empty_translation_unit;
#endif
#endif /* not SP_MULTI_BYTE */