/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
See the file COPYING for copying permission.
*/
#include <stddef.h>
#ifdef COMPILED_FROM_DSP
#include "winconfig.h"
#elif defined(MACOS_CLASSIC)
#include "macconfig.h"
#elif defined(__amigaos__)
#include "amigaconfig.h"
#elif defined(__WATCOMC__)
#include "watcomconfig.h"
#else
#ifdef HAVE_EXPAT_CONFIG_H
#include <expat_config.h>
#endif
#endif /* ndef COMPILED_FROM_DSP */
#include "expat_external.h"
#include "internal.h"
#include "xmltok.h"
#include "nametab.h"
#ifdef XML_DTD
#else
#endif
#define VTABLE1 \
PREFIX(nameLength), \
PREFIX(charRefNumber), \
PREFIX(updatePosition), \
/* A 2 byte UTF-8 representation splits the character's 11 bits between
the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
/* A 3 byte UTF-8 representation splits the character's 16 bits between
the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
into pages, 3 bits to add to that index and 5 bits to generate the
mask.
*/
<< 3) \
((n) == 2 \
? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
: ((n) == 3 \
? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
: 0))
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
with the additional restriction of not allowing the Unicode
code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
Implementation details:
(A & 0x80) == 0 means A < 0x80
and
(A & 0xC0) == 0xC0 means A > 0xBF
*/
/* Nonzero when the 2-byte UTF-8 sequence starting at p is malformed:
   either the lead byte is below 0xC2 (0xC0 and 0xC1 could only begin
   overlong encodings), or the second byte is not a continuation byte
   in the range 0x80..0xBF.
   p must point at unsigned bytes.  It is evaluated several times, so
   it must have no side effects.  Every use of the parameter is
   parenthesized, so pointer expressions such as `s + 1` are safe. */
#define UTF8_INVALID2(p) \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
/* Nonzero when the 3-byte UTF-8 sequence starting at p is malformed.
   Checks, in order:
     - byte 2 must have its top bit set;
     - for EF BF xx, byte 2 above 0xBD is rejected (this excludes
       EF BF BE and EF BF BF, i.e. 0xFFFE and 0xFFFF); otherwise byte 2
       must not look like a lead byte (top two bits both set);
     - for lead byte E0, byte 1 must be in 0xA0..0xBF (no overlong forms);
     - for lead byte ED, byte 1 must be in 0x80..0x9F (no surrogates);
     - for any other lead byte, byte 1 must be in 0x80..0xBF.
   p must point at unsigned bytes.  It is evaluated several times, so
   it must have no side effects.  Every use of the parameter is
   parenthesized, so pointer expressions such as `s + 1` are safe. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF \
       ? (p)[2] > 0xBD \
       : ((p)[2] & 0xC0) == 0xC0) \
   || ((p)[0] == 0xE0 \
       ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
       : ((p)[1] & 0x80) == 0 \
         || ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
/* Nonzero when the 4-byte UTF-8 sequence starting at p is malformed.
   Checks, in order:
     - bytes 3 and 2 must each be continuation bytes (0x80..0xBF);
     - for lead byte F0, byte 1 must be in 0x90..0xBF (no overlong forms);
     - for lead byte F4, byte 1 must be in 0x80..0x8F (nothing above
       0x10FFFF);
     - for any other lead byte, byte 1 must be in 0x80..0xBF.
   p must point at unsigned bytes.  It is evaluated several times, so
   it must have no side effects.  Every use of the parameter is
   parenthesized, so pointer expressions such as `s + 1` are safe. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
   || ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
   || ((p)[0] == 0xF0 \
       ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
       : ((p)[1] & 0x80) == 0 \
         || ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
static int PTRFASTCALL
{
return 0;
}
static int PTRFASTCALL
{
return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
}
static int PTRFASTCALL
{
return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
}
static int PTRFASTCALL
{
return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
}
static int PTRFASTCALL
{
return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
}
static int PTRFASTCALL
{
return UTF8_INVALID2((const unsigned char *)p);
}
static int PTRFASTCALL
{
return UTF8_INVALID3((const unsigned char *)p);
}
static int PTRFASTCALL
{
return UTF8_INVALID4((const unsigned char *)p);
}
struct normal_encoding {
#ifdef XML_MIN_SIZE
#endif /* XML_MIN_SIZE */
};
#ifdef XML_MIN_SIZE
/* XML_MIN_SIZE variant.  Expands to five identifiers built by pasting
   the prefix E onto byteType, isNameMin, isNmstrtMin, byteToAscii and
   charMatches.  Each identifier, including the last, is followed by a
   comma -- presumably so that further list entries can follow the
   expansion directly; confirm at the use sites. */
#define STANDARD_VTABLE(E) \
E ## byteType, \
E ## isNameMin, \
E ## isNmstrtMin, \
E ## byteToAscii, \
E ## charMatches,
#else
#endif
/* Expands to nine comma-separated identifiers built by pasting the
   prefix E onto isName2/3/4, isNmstrt2/3/4 and isInvalid2/3/4, in that
   order.  Unlike the XML_MIN_SIZE STANDARD_VTABLE above, the expansion
   has no trailing comma. */
#define NORMAL_VTABLE(E) \
E ## isName2, \
E ## isName3, \
E ## isName4, \
E ## isNmstrt2, \
E ## isNmstrt3, \
E ## isNmstrt4, \
E ## isInvalid2, \
E ## isInvalid3, \
E ## isInvalid4
static int FASTCALL checkCharRefNumber(int);
#include "xmltok_impl.h"
#include "ascii.h"
#ifdef XML_MIN_SIZE
#endif
#ifdef XML_MIN_SIZE
#else
/* minimum bytes per character */
#endif
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
{
return SB_BYTE_TYPE(enc, p);
}
#else
#endif
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
{
return *p;
}
#else
#endif
#ifdef XML_MIN_SIZE
#else
#endif
#ifdef XML_MIN_SIZE
static int PTRCALL
{
return *p == c;
}
#else
/* c is an ASCII character */
#endif
#define XML_TOK_IMPL_C
#include "xmltok_impl.ci"
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
};
static void PTRCALL
{
char *to;
const char *from;
/* Avoid copying partial characters. */
break;
}
}
static void PTRCALL
{
case BT_LEAD2:
from += 2;
break;
case BT_LEAD3:
from += 3;
break;
case BT_LEAD4:
{
unsigned long n;
goto after;
n -= 0x10000;
to += 2;
from += 4;
}
break;
default:
break;
}
}
}
#ifdef XML_NS
{
#include "asciitab.h"
#include "utf8tab.h"
},
};
#endif
{
#include "asciitab.h"
#include "utf8tab.h"
},
};
#ifdef XML_NS
{
#include "iasciitab.h"
#include "utf8tab.h"
},
};
#endif
{
#include "iasciitab.h"
#include "utf8tab.h"
},
};
static void PTRCALL
{
for (;;) {
unsigned char c;
break;
c = (unsigned char)**fromP;
if (c & 0x80) {
break;
(*fromP)++;
}
else {
break;
}
}
}
static void PTRCALL
{
}
#ifdef XML_NS
{
#include "asciitab.h"
#include "latin1tab.h"
},
};
#endif
{
#include "asciitab.h"
#include "latin1tab.h"
},
};
static void PTRCALL
{
}
#ifdef XML_NS
{
#include "asciitab.h"
/* BT_NONXML == 0 */
},
};
#endif
{
#include "asciitab.h"
/* BT_NONXML == 0 */
},
};
/* Classify a two-byte unit from the values `hi` and `lo`.
   NOTE(review): the line carrying this function's name and parameter
   list is missing from this excerpt.  Calls elsewhere in the file of
   the form unicode_byte_type(<byte>, <byte>) suggest this is that
   function -- confirm against the full file.
   Result:
     BT_LEAD4     when hi is 0xD8..0xDB
     BT_TRAIL     when hi is 0xDC..0xDF
     BT_NONXML    when hi is 0xFF and lo is 0xFE or 0xFF
     BT_NONASCII  in every other case */
static int PTRFASTCALL
{
switch ((unsigned char)hi) {
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
return BT_LEAD4;
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
return BT_TRAIL;
case 0xFF:
/* Only 0xFFFE and 0xFFFF are rejected within the 0xFFxx page. */
switch ((unsigned char)lo) {
case 0xFF:
case 0xFE:
return BT_NONXML;
}
/* Any other lo value falls out to the BT_NONASCII return below. */
break;
}
return BT_NONASCII;
}
#define DEFINE_UTF16_TO_UTF8(E) \
static void PTRCALL \
{ \
const char *from; \
int plane; \
unsigned char lo2; \
switch (hi) { \
case 0: \
if (lo < 0x80) { \
return; \
} \
break; \
} \
/* fall through */ \
case 0x1: case 0x2: case 0x3: \
case 0x4: case 0x5: case 0x6: case 0x7: \
return; \
} \
break; \
default: \
return; \
} \
/* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
break; \
case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
return; \
} \
from += 2; \
| (lo2 >> 6) \
| 0x80); \
break; \
} \
} \
}
#define DEFINE_UTF16_TO_UTF16(E) \
static void PTRCALL \
{ \
/* Avoid copying first half only of surrogate */ \
fromLim -= 2; \
}
((p)[1] == 0 \
: unicode_byte_type((p)[1], (p)[0]))
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
{
return LITTLE2_BYTE_TYPE(enc, p);
}
static int PTRFASTCALL
{
return LITTLE2_BYTE_TO_ASCII(enc, p);
}
static int PTRCALL
{
return LITTLE2_CHAR_MATCHES(enc, p, c);
}
static int PTRFASTCALL
{
return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}
static int PTRFASTCALL
{
return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}
#else /* not XML_MIN_SIZE */
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
/* Both tests are constant false here: each macro ignores enc, p and n
   and evaluates to 0. */
#define IS_NAME_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define XML_TOK_IMPL_C
#include "xmltok_impl.ci"
#endif /* not XML_MIN_SIZE */
#ifdef XML_NS
{ VTABLE, 2, 0,
#if BYTEORDER == 1234
1
#else
0
#endif
},
{
#include "asciitab.h"
#include "latin1tab.h"
},
};
#endif
{ VTABLE, 2, 0,
#if BYTEORDER == 1234
1
#else
0
#endif
},
{
#include "asciitab.h"
#include "latin1tab.h"
},
};
#if BYTEORDER != 4321
#ifdef XML_NS
{
#include "iasciitab.h"
#include "latin1tab.h"
},
};
#endif
{
#include "iasciitab.h"
#include "latin1tab.h"
},
};
#endif
((p)[0] == 0 \
: unicode_byte_type((p)[0], (p)[1]))
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
{
return BIG2_BYTE_TYPE(enc, p);
}
static int PTRFASTCALL
{
return BIG2_BYTE_TO_ASCII(enc, p);
}
static int PTRCALL
{
return BIG2_CHAR_MATCHES(enc, p, c);
}
static int PTRFASTCALL
{
return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}
static int PTRFASTCALL
{
return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}
#else /* not XML_MIN_SIZE */
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define XML_TOK_IMPL_C
#include "xmltok_impl.ci"
#endif /* not XML_MIN_SIZE */
#ifdef XML_NS
{ VTABLE, 2, 0,
#if BYTEORDER == 4321
1
#else
0
#endif
},
{
#include "asciitab.h"
#include "latin1tab.h"
},
};
#endif
{ VTABLE, 2, 0,
#if BYTEORDER == 4321
1
#else
0
#endif
},
{
#include "asciitab.h"
#include "latin1tab.h"
},
};
#if BYTEORDER != 1234
#ifdef XML_NS
{
#include "iasciitab.h"
#include "latin1tab.h"
},
};
#endif
{
#include "iasciitab.h"
#include "latin1tab.h"
},
};
#endif
static int FASTCALL
{
for (;;) {
return 0;
if (!c1)
break;
}
return 1;
}
static void PTRCALL
{
}
static int
{
char *p = buf;
if (p == buf)
return -1;
else
return buf[0];
}
/* Tell whether c is one of four white-space values: space (0x20),
   carriage return (0xD), line feed (0xA) or tab (0x9).
   Returns 1 if it is, 0 otherwise. */
static int FASTCALL
isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
/* Return 1 if there's just optional white space or there's an S
followed by name=val.
*/
static int
const char *ptr,
const char *end,
const char **namePtr,
const char **nameEndPtr,
const char **valPtr,
const char **nextTokPtr)
{
int c;
char open;
return 1;
}
*nextTokPtr = ptr;
return 0;
}
do {
return 1;
}
for (;;) {
if (c == -1) {
*nextTokPtr = ptr;
return 0;
}
if (c == ASCII_EQUALS) {
*nameEndPtr = ptr;
break;
}
if (isSpace(c)) {
*nameEndPtr = ptr;
do {
if (c != ASCII_EQUALS) {
*nextTokPtr = ptr;
return 0;
}
break;
}
}
*nextTokPtr = ptr;
return 0;
}
while (isSpace(c)) {
}
if (c != ASCII_QUOT && c != ASCII_APOS) {
*nextTokPtr = ptr;
return 0;
}
open = (char)c;
if (c == open)
break;
&& c != ASCII_PERIOD
&& c != ASCII_MINUS
&& c != ASCII_UNDERSCORE) {
*nextTokPtr = ptr;
return 0;
}
}
return 1;
}
static const char KW_version[] = {
};
static const char KW_encoding[] = {
};
static const char KW_standalone[] = {
};
static const char KW_yes[] = {
};
static const char KW_no[] = {
};
static int
const char *,
const char *),
int isGeneralTextEntity,
const char *ptr,
const char *end,
const char **badPtr,
const char **versionPtr,
const char **versionEndPtr,
const char **encodingName,
int *standalone)
{
|| !name) {
return 0;
}
if (!isGeneralTextEntity) {
return 0;
}
}
else {
if (versionPtr)
*versionPtr = val;
if (versionEndPtr)
*versionEndPtr = ptr;
return 0;
}
if (!name) {
if (isGeneralTextEntity) {
/* a TextDecl must have an EncodingDecl */
return 0;
}
return 1;
}
}
return 0;
}
if (encodingName)
*encodingName = val;
if (encoding)
return 0;
}
if (!name)
return 1;
}
|| isGeneralTextEntity) {
return 0;
}
if (standalone)
*standalone = 1;
}
if (standalone)
*standalone = 0;
}
else {
return 0;
}
return 0;
}
return 1;
}
/* Validate a numeric value by inspecting its bits above the low eight
   (result >> 8).  Returns -1 to reject, otherwise returns `result`
   unchanged.
   NOTE(review): the line carrying this function's name and parameter
   is missing from this excerpt.  The forward declaration
   `static int FASTCALL checkCharRefNumber(int);` earlier in the file
   suggests this is that function -- confirm against the full file. */
static int FASTCALL
{
switch (result >> 8) {
/* 0xD8xx..0xDFxx are always rejected. */
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
return -1;
/* NOTE(review): as shown, every value whose upper part is 0 or 0xFF
   is rejected unconditionally and the `break` statements below are
   unreachable.  A guarding condition before each `return -1` appears
   to have been lost from this excerpt -- confirm. */
case 0:
return -1;
break;
case 0xFF:
return -1;
break;
}
return result;
}
int FASTCALL
{
enum {
/* minN is minimum legal resulting value for N byte sequence */
};
if (c < 0)
return 0;
if (c < min2) {
buf[0] = (char)(c | UTF8_cval1);
return 1;
}
if (c < min3) {
return 2;
}
if (c < min4) {
return 3;
}
if (c < 0x110000) {
return 4;
}
return 0;
}
int FASTCALL
{
if (charNum < 0)
return 0;
if (charNum < 0x10000) {
return 1;
}
if (charNum < 0x110000) {
charNum -= 0x10000;
return 2;
}
return 0;
}
/* Record whose size is reported by XmlSizeOfUnknownEncoding().
   NOTE(review): only userData is visible in this excerpt, but code
   later in the file writes e->utf8[i][...] and e->utf16[i] and tests
   a `convert` value.  Those are presumably members of this struct
   that are missing here -- confirm against the full file. */
struct unknown_encoding {
void *userData; /* presumably the caller's opaque pointer -- confirm */
};
/* Report the size, in bytes, of struct unknown_encoding as an int. */
int
XmlSizeOfUnknownEncoding(void)
{
  return (int)sizeof(struct unknown_encoding);
}
static int PTRFASTCALL
{
if (c & ~0xFFFF)
return 0;
}
static int PTRFASTCALL
{
if (c & ~0xFFFF)
return 0;
}
static int PTRFASTCALL
{
return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
}
static void PTRCALL
{
for (;;) {
const char *utf8;
int n;
break;
n = *utf8++;
if (n == 0) {
n = XmlUtf8Encode(c, buf);
break;
- (BT_LEAD2 - 2));
}
else {
break;
(*fromP)++;
}
do {
} while (--n != 0);
}
}
static void PTRCALL
{
if (c == 0) {
c = (unsigned short)
- (BT_LEAD2 - 2));
}
else
(*fromP)++;
*(*toP)++ = c;
}
}
ENCODING *
int *table,
void *userData)
{
int i;
for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
((char *)mem)[i] = ((char *)&latin1_encoding)[i];
for (i = 0; i < 128; i++)
&& table[i] != i)
return 0;
for (i = 0; i < 256; i++) {
int c = table[i];
if (c == -1) {
/* This shouldn't really get used. */
e->utf16[i] = 0xFFFF;
e->utf8[i][0] = 1;
e->utf8[i][1] = 0;
}
else if (c < 0) {
if (c < -4)
return 0;
e->utf8[i][0] = 0;
e->utf16[i] = 0;
}
else if (c < 0x80) {
&& c != i)
return 0;
e->utf8[i][0] = 1;
e->utf8[i][1] = (char)c;
e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
}
else if (checkCharRefNumber(c) < 0) {
/* This shouldn't really get used. */
e->utf16[i] = 0xFFFF;
e->utf8[i][0] = 1;
e->utf8[i][1] = 0;
}
else {
if (c > 0xFFFF)
return 0;
else
e->utf16[i] = (unsigned short)c;
}
}
if (convert) {
}
}
/* If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
ISO_8859_1_ENC = 0,
/* must match encodingNames up to here */
};
static const char KW_ISO_8859_1[] = {
};
static const char KW_US_ASCII[] = {
'\0'
};
static const char KW_UTF_8[] = {
};
static const char KW_UTF_16[] = {
};
static const char KW_UTF_16BE[] = {
'\0'
};
static const char KW_UTF_16LE[] = {
'\0'
};
static int FASTCALL
{
static const char * const encodingNames[] = {
};
int i;
return NO_ENC;
for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
return i;
return UNKNOWN_ENC;
}
/* For binary compatibility, we store the index of the encoding
specified at initialization in the isUtf16 member.
*/
/* This is what detects the encoding. encodingTable maps from
encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
the external (protocol) specified encoding; state is
XML_CONTENT_STATE if we're parsing an external text entity, and
XML_PROLOG_STATE otherwise.
*/
static int
const INIT_ENCODING *enc,
int state,
const char *ptr,
const char *end,
const char **nextTokPtr)
{
return XML_TOK_NONE;
/* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
/* a well-formed document entity must have more than one byte */
if (state != XML_CONTENT_STATE)
return XML_TOK_PARTIAL;
#endif
/* so we're parsing an external text entity... */
/* if UTF-16 was externally specified, then we need at least 2 bytes */
switch (INIT_ENC_INDEX(enc)) {
case UTF_16_ENC:
case UTF_16LE_ENC:
case UTF_16BE_ENC:
return XML_TOK_PARTIAL;
}
switch ((unsigned char)*ptr) {
case 0xFE:
case 0xFF:
case 0xEF: /* possibly first byte of UTF-8 BOM */
&& state == XML_CONTENT_STATE)
break;
/* fall through */
case 0x00:
case 0x3C:
return XML_TOK_PARTIAL;
}
}
else {
case 0xFEFF:
&& state == XML_CONTENT_STATE)
break;
return XML_TOK_BOM;
/* 00 3C is handled in the default case */
case 0x3C00:
&& state == XML_CONTENT_STATE)
break;
case 0xFFFE:
&& state == XML_CONTENT_STATE)
break;
return XML_TOK_BOM;
case 0xEFBB:
/* Maybe a UTF-8 BOM (EF BB BF) */
/* If there's an explicitly specified (external) encoding
of ISO-8859-1 or some flavour of UTF-16
and this is an external text entity,
don't look for the BOM,
because it might be legal data.
*/
if (state == XML_CONTENT_STATE) {
int e = INIT_ENC_INDEX(enc);
if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
|| e == UTF_16LE_ENC || e == UTF_16_ENC)
break;
}
return XML_TOK_PARTIAL;
return XML_TOK_BOM;
}
break;
default:
if (ptr[0] == '\0') {
/* 0 isn't a legal data character. Furthermore a document
entity can only start with ASCII characters. So the only
way this can fail to be big-endian UTF-16 is if it's an
external parsed general entity that's labelled as
UTF-16LE.
*/
break;
}
/* We could recover here in the case:
- parsing an external entity
- second byte is 0
- no externally specified encoding
- no encoding declaration
by assuming UTF-16LE. But we don't, because this would mean when
presented just with a single byte, we couldn't reliably determine
whether we needed further bytes.
*/
if (state == XML_CONTENT_STATE)
break;
}
break;
}
}
}
/* Identity name-mapping macros: with these definitions NS(x) and ns(x)
   both expand to x unchanged.  They are in effect for the inclusion of
   "xmltok_ns.ci" that follows.  The expansion is deliberately left
   unparenthesized -- presumably because it is used to form identifiers
   in declarations; confirm in xmltok_ns.ci. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.ci"
#ifdef XML_NS
#define XML_TOK_NS_C
#include "xmltok_ns.ci"
ENCODING *
int *table,
void *userData)
{
if (enc)
return enc;
}
#endif /* XML_NS */