/* utf8.h
*
* Copyright (C) 2000, 2001, 2002, by Larry Wall and others
*
* You may distribute under the terms of either the GNU General Public
* License or the Artistic License, as specified in the README file.
*
*/
/* Use UTF-8 as the default script encoding?
* Turning this on will break scripts having non-UTF-8 binary
* data (such as Latin-1) in string literals. */
#ifdef USE_UTF8_SCRIPTS
#else
#endif
#ifdef EBCDIC
/* The equivalents of these macros, implementing UTF-EBCDIC instead,
are in the following header file:
*/
#include "utfebcdic.h"
#else
#ifdef DOINIT
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus */
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* scripts */
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6, /* cjk etc. */
7,13, /* Perl extended (not UTF-8). Up to 72bit allowed (64-bit + reserved). */
};
#else
/* Without DOINIT, only declare the byte table; its size is left unspecified
 * here.  NOTE(review): the initialized definition is presumably the one
 * emitted by the DOINIT branch -- confirm. */
EXTCONST unsigned char PL_utf8skip[];
#endif
/* Native character to iso-8859-1 */
/* Transform after encoding */
/* Transforms in wide UV chars */
/* Transforms in invariant space */
/* As there are no translations avoid the function wrapper */
/*
The following table is from Unicode 3.2.
Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
U+0000..U+007F 00..7F
U+0080..U+07FF C2..DF 80..BF
U+0800..U+0FFF E0 A0..BF 80..BF
U+1000..U+CFFF E1..EC 80..BF 80..BF
U+D000..U+D7FF ED 80..9F 80..BF
U+D800..U+DFFF ******* ill-formed *******
U+E000..U+FFFF EE..EF 80..BF 80..BF
U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000..U+D7FF,
the 90..BF in U+10000..U+3FFFF, and the 80..8F in U+100000..U+10FFFF.
The "gaps" are caused by legal UTF-8 avoiding non-shortest encodings:
it is technically possible to UTF-8-encode a single code point in different
ways, but that is explicitly forbidden, and the shortest possible encoding
should always be used (and that is what Perl does).
*/
/*
Another way to look at it, as bits:
Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
0aaaaaaa 0aaaaaaa
00000bbbbbaaaaaa 110bbbbb 10aaaaaa
ccccbbbbbbaaaaaa 1110cccc 10bbbbbb 10aaaaaa
00000dddccccccbbbbbbaaaaaa 11110ddd 10cccccc 10bbbbbb 10aaaaaa
As you can see, the continuation bytes all begin with C<10>, and the
leading bits of the start byte tell how many bytes there are in the
encoded character.
*/
/* Fold one more byte into a value that is being decoded: shift "old" left
 * by UTF_ACCUMULATION_SHIFT and OR in the bits of "new" (narrowed to U8)
 * that are selected by UTF_CONTINUATION_MASK.
 * "new" is parenthesized inside the cast so that an expression argument
 * (e.g. "x >> 8") is narrowed as a whole, not just its first operand. */
#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)(new)) & UTF_CONTINUATION_MASK))
#ifdef HAS_QUAD
#else
/* No, I'm not even going to *TRY* putting #ifdef inside a #define */
#endif
/*
* Note: we try to be careful never to call the isXXX_utf8() functions
* unless we're pretty sure we've seen the beginning of a UTF-8 character
* (that is, the two high bits are set). Otherwise we risk loading in the
* heavy-duty SWASHINIT and SWASHGET routines unnecessarily.
*/
? isIDFIRST(*(p)) \
: isIDFIRST_utf8((U8*)p))
? isALNUM(*(p)) \
: isALNUM_utf8((U8*)p))
#endif /* EBCDIC vs ASCII */
/* Rest of these are attributes of Unicode and perl's internals rather than the encoding */
/* how wide can a single UTF-8 encoded character become */
/* how wide a character can become when upper/lowercased */
/* how wide a character can become when casefolded */
/* Though our UTF-8 encoding can go beyond this,
* let's be conservative and do as Unicode 3.2 says. */
(c) <= UNICODE_SURROGATE_LAST)
#ifdef HAS_QUAD
#endif
#ifdef EBCDIC
#else
#endif