1N/A * Copyright (C) 2000, 2001, 2002, 2003, 2004, by Larry Wall and others 1N/A * You may distribute under the terms of either the GNU General Public 1N/A * License or the Artistic License, as specified in the README file. 1N/A * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever 1N/A * heard of that we don't want to see any closer; and that's the one place 1N/A * we're trying to get to! And that's just where we can't get, nohow.' 1N/A * 'Well do I understand your speech,' he answered in the same language; 1N/A * 'yet few strangers do so. Why then do you not speak in the Common Tongue, 1N/A * as is the custom in the West, if you wish to be answered?' 1N/A * ...the travellers perceived that the floor was paved with stones of many 1N/A * hues; branching runes and strange devices intertwined beneath their feet. 1N/Astatic char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
1N/A=head1 Unicode Support 1N/A=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags 1N/AAdds the UTF-8 representation of the Unicode codepoint C<uv> to the end 1N/Aof the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free 1N/Abytes available. The return value is the pointer to the byte after the 1N/Aend of the new character. In other words, 1N/A d = uvuni_to_utf8_flags(d, uv, flags); 1N/A d = uvuni_to_utf8(d, uv); 1N/A(which is equivalent to) 1N/A d = uvuni_to_utf8_flags(d, uv, 0); 1N/Ais the recommended Unicode-aware way of saying 1N/A ((
uv &
0xFFFE) ==
0xFFFE &&
/* Either FFFE or FFFF. */ 1N/A /* UNICODE_ALLOW_SUPER includes 1N/A * FFFEs and FFFFs beyond 0x10FFFF. */ 1N/A "Unicode character 0x%04"UVxf" is illegal",
uv);
1N/A#
else /* Non loop style */ 1N/A *d++ = (
U8)(((
uv >>
6) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
12) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
6) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
18) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
12) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
6) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
24) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
18) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
12) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
6) &
0x3f) |
0x80);
1N/A *d++ =
0xfe;
/* Can't match U+FEFF! */ 1N/A *d++ = (
U8)(((
uv >>
30) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
24) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
18) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
12) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
6) &
0x3f) |
0x80);
1N/A *d++ =
0xff;
/* Can't match U+FFFE! */ 1N/A *d++ =
0x80;
/* 6 Reserved bits */ 1N/A *d++ = (
U8)(((
uv >>
60) &
0x0f) |
0x80);
/* 2 Reserved bits */ 1N/A *d++ = (
U8)(((
uv >>
54) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
48) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
42) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
36) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
30) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
24) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
18) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
12) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
6) &
0x3f) |
0x80);
1N/A#
endif /* Loop style */ 1N/A=for apidoc A|STRLEN|is_utf8_char|U8 *s 1N/ATests if some arbitrary number of bytes begins in a valid UTF-8 1N/Acharacter. Note that an INVARIANT (i.e. ASCII) character is a valid 1N/AUTF-8 character. The actual number of bytes in the UTF-8 character 1N/Awill be returned if it is valid, otherwise 0. 1N/A=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len 1N/AReturns true if first C<len> bytes of the given string form a valid 1N/AUTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does 1N/Anot mean 'a string that contains code points above 0x7F encoded in UTF-8' 1N/Abecause a valid ASCII string is a valid UTF-8 string. 1N/A /* Inline the easy bits of is_utf8_char() here for speed... */ 1N/A /* ... and call is_utf8_char() only if really needed. */ 1N/A=for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p 1N/ALike is_utf8_string, but stores the location of the failure in C<p>. 1N/A /* Inline the easy bits of is_utf8_char() here for speed... */ 1N/A /* ... and call is_utf8_char() only if really needed. */ 1N/A=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags 1N/ABottom level UTF-8 decode routine. 1N/AReturns the unicode code point value of the first character in the string C<s> 1N/Awhich is assumed to be in UTF-8 encoding and no longer than C<curlen>; 1N/AC<retlen> will be set to the length, in bytes, of that character. 1N/AIf C<s> does not point to a well-formed UTF-8 character, the behaviour 1N/Ais dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY, 1N/Ait is assumed that the caller will raise a warning, and this function 1N/Awill silently just set C<retlen> to C<-1> and return zero. If the 1N/AC<flags> does not contain UTF8_CHECK_ONLY, warnings about 1N/Amalformations will be given, C<retlen> will be set to the expected 1N/Alength of the UTF-8 character in bytes, and zero will be returned. 
1N/AThe C<flags> can also contain various flags to allow deviations from 1N/AMost code should use utf8_to_uvchr() rather than call this directly. 1N/A/* This list is a superset of the UTF8_ALLOW_XXX. */ 1N/A /* These cannot be allowed. */ 1N/A else {
/* uv < ouv */ 1N/A /* This cannot be allowed. */ 1N/A case 0:
/* Intentionally empty. */ break;
1N/A=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen 1N/AReturns the native character value of the first character in the string C<s> 1N/Awhich is assumed to be in UTF-8 encoding; C<retlen> will be set to the 1N/Alength, in bytes, of that character. 1N/AIf C<s> does not point to a well-formed UTF-8 character, zero is 1N/Areturned and retlen is set, if possible, to -1. 1N/A=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen 1N/AReturns the Unicode code point of the first character in the string C<s> 1N/Awhich is assumed to be in UTF-8 encoding; C<retlen> will be set to the 1N/Alength, in bytes, of that character. 1N/AThis function should only be used when returned UV is considered 1N/Aan index into the Unicode semantic tables (e.g. swashes). 1N/AIf C<s> does not point to a well-formed UTF-8 character, zero is 1N/Areturned and retlen is set, if possible, to -1. 1N/A /* Call the low level routine asking for checks */ 1N/A=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e 1N/AReturn the length of the UTF-8 char encoded string C<s> in characters. 1N/AStops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end 1N/Aup past C<e>, croaks. 1N/A /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. 1N/A * the bitops (especially ~) can create illegal UTF-8. 1N/A * In other words: in Perl UTF-8 is not just for Unicode. */ 1N/A=for apidoc A|IV|utf8_distance|U8 *a|U8 *b 1N/AReturns the number of UTF-8 characters between the UTF-8 pointers C<a> 1N/AWARNING: use only if you *know* that the pointers point inside the 1N/A /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. 1N/A * the bitops (especially ~) can create illegal UTF-8. 1N/A * In other words: in Perl UTF-8 is not just for Unicode. 
*/ 1N/A=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off 1N/AReturn the UTF-8 pointer C<s> displaced by C<off> characters, either forward or backward. 1N/AWARNING: do not use the following unless you *know* C<off> is within 1N/Athe UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned 1N/Aon the first byte of a character or just after the last byte of a character. 1N/A /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. 1N/A * the bitops (especially ~) can create illegal UTF-8. 1N/A * In other words: in Perl UTF-8 is not just for Unicode. */ 1N/A=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len 1N/AConverts a string C<s> of length C<len> from UTF-8 into byte encoding. 1N/AUnlike C<bytes_to_utf8>, this over-writes the original string, and 1N/Aupdates C<len> to contain the new length. 1N/AReturns zero on failure, setting C<len> to -1. 1N/A /* ensure valid UTF-8 and chars < 256 before updating string */ 1N/A=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8 1N/AConverts a string C<s> of length C<len> from UTF-8 into byte encoding. 1N/AUnlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to 1N/Athe newly-created string, and updates C<len> to contain the new 1N/Alength. Returns the original string if no conversion occurs, in which case C<len> 1N/Ais unchanged. Does nothing if C<is_utf8> points to 0. Sets C<is_utf8> to 1N/A0 if C<s> is converted or contains all 7bit characters. 1N/A /* ensure valid UTF-8 and chars < 256 before converting string */ 1N/A /* Then it is two-byte encoded */ 1N/A=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len 1N/AConverts a string C<s> of length C<len> from ASCII into UTF-8 encoding. 1N/AReturns a pointer to the newly-created string, and sets C<len> to 1N/Areflect the new length. 1N/AIf you want to convert to UTF-8 from other encodings than ASCII, 1N/Asee sv_recode_to_utf8(). 1N/A * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8. 1N/A * Destination must be pre-extended to 3/2 source. Do not use in-place. 
1N/A * We optimize for native, for obvious reasons. */ 1N/A UV uv = (p[0] <<
8) + p[
1];
/* UTF-16BE */ 1N/A if (
uv >=
0xd800 &&
uv <
0xdbff) {
/* surrogates */ 1N/A uv = ((
uv -
0xd800) <<
10) + (
low -
0xdc00) +
0x10000;
1N/A *d++ = (
U8)(((
uv >>
6) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
12) &
0x3f) |
0x80);
1N/A *d++ = (
U8)(((
uv >>
6) &
0x3f) |
0x80);
1N/A/* Note: this one is slightly destructive of the source. */ 1N/A/* for now these are all defined (inefficiently) in terms of the utf8 versions */ 1N/A/* for now these all assume no locale info available for Unicode > 255 */ 1N/A /* XXX returns only the first character -- do not use XXX */ 1N/A /* XXX no locale support yet */ 1N/A /* XXX returns only the first character XXX -- do not use XXX */ 1N/A /* XXX no locale support yet */ 1N/A /* XXX returns only the first character -- do not use XXX */ 1N/A /* XXX no locale support yet */ 1N/A /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true 1N/A * descendant of isalnum(3), in other words, it doesn't 1N/A * contain the '_'. --jhi */ 1N/A/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */ 1N/A/* return is_utf8_alpha(p) || is_utf8_digit(p); */ 1N/A=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special 1N/AThe "p" contains the pointer to the UTF-8 string encoding 1N/Athe character that is being converted. 1N/AThe "ustrp" is a pointer to the character buffer to put the 1N/Aconversion result to. The "lenp" is a pointer to the length 1N/AThe "swashp" is a pointer to the swash to use. 1N/Abut not always, a multicharacter mapping), is tried first. 1N/AThe "special" is a string like "utf8::ToSpecLower", which means the 1N/Ahash %utf8::ToSpecLower. The access to the hash is through 1N/AThe "normal" is a string like "ToLower" which means the swash 1N/A /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings 1N/A * are necessary in EBCDIC, they are redundant no-ops 1N/A * in ASCII-ish platforms, and hopefully optimized away. */ 1N/A /* The 0xDF is the only special casing Unicode code point below 0x100. */ 1N/A /* It might be "special" (sometimes, but not always, 1N/A * a multicharacter mapping) */ 1N/A /* If we have EBCDIC we need to remap the characters 1N/A * since any characters in the low 256 are Unicode 1N/A * code points, not EBCDIC. 
*/ 1N/A /* It was "normal" (a single character mapping). */ 1N/A if (!
len)
/* Neither: just copy. */ 1N/A=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp 1N/AConvert the UTF-8 encoded character at p to its uppercase version and 1N/Astore that in UTF-8 in ustrp and its length in bytes in lenp. Note 1N/Athat the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the 1N/Auppercase version may be longer than the original character (up to two 1N/AThe first character of the uppercased version is returned 1N/A(but note, as explained above, that there may be more.) 1N/A=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp 1N/AConvert the UTF-8 encoded character at p to its titlecase version and 1N/Astore that in UTF-8 in ustrp and its length in bytes in lenp. Note 1N/Athat the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the 1N/Atitlecase version may be longer than the original character (up to two 1N/AThe first character of the titlecased version is returned 1N/A(but note, as explained above, that there may be more.) 1N/A=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp 1N/AConvert the UTF-8 encoded character at p to its lowercase version and 1N/Astore that in UTF-8 in ustrp and its length in bytes in lenp. Note 1N/Athat the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the 1N/Alowercase version may be longer than the original character (up to two 1N/AThe first character of the lowercased version is returned 1N/A(but note, as explained above, that there may be more.) 1N/A=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp 1N/AConvert the UTF-8 encoded character at p to its foldcase version and 1N/Astore that in UTF-8 in ustrp and its length in bytes in lenp. Note 1N/Athat the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the 1N/Afoldcase version may be longer than the original character (up to 1N/AThe first character of the foldcased version is returned 1N/A(but note, as explained above, that there may be more.) 
1N/A/* a "swash" is a swatch hash */ 1N/A /* XXX ought to be handled by lex_start */ 1N/A/* This API is wrong for special case conversions since we may need to 1N/A * return several Unicode characters for a single Unicode character 1N/A * the lower-level routine, and it is similarly broken for returning 1N/A * multiple values. --jhi */ 1N/A /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ 1N/A * then the "swatch" is a vec() for all the chars which start 1N/A * So the key in the hash (klen) is length of encoded char -1 1N/A /* If char is invariant then swatch is for all the invariant chars 1N/A * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK 1N/A /* If char is encoded then swatch is for the prefix */ 1N/A * This single-entry cache saves about 1/3 of the utf8 overhead in test 1N/A * suite. (That is, only 7-8% overall over just a hash cache. Still, 1N/A * it's nothing to sniff at.) Pity we usually come through at least 1N/A * two function calls to get here... 1N/A * NB: this code assumes that swatches are never modified, once generated! 1N/A /* Try our second-level swatch cache, kept in a hash. */ 1N/A /* If not cached, generate it via utf8::SWASHGET */ 1N/A /* We use utf8n_to_uvuni() as we want an index into 1N/A Unicode tables, not a native character number. 1N/A /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */ 1N/A=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv 1N/AAdds the UTF-8 representation of the Native codepoint C<uv> to the end 1N/Aof the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free 1N/Abytes available. The return value is the pointer to the byte after the 1N/Aend of the new character. 
In other words, 1N/A d = uvchr_to_utf8(d, uv); 1N/Ais the recommended wide native character-aware way of saying 1N/A/* On ASCII machines this is normally a macro but we want a 1N/A real function in case XS code wants it 1N/A=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags 1N/AReturns the native character value of the first character in the string C<s> 1N/Awhich is assumed to be in UTF-8 encoding; C<retlen> will be set to the 1N/Alength, in bytes, of that character. 1N/AAllows length and flags to be passed to low level routine. 1N/A/* On ASCII machines this is normally a macro but we want 1N/A a real function in case XS code wants it 1N/A=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags 1N/ABuild to the scalar dsv a displayable version of the string spv, 1N/Alength len, the displayable version being at most pvlim bytes long 1N/A(if longer, the rest is truncated and "..." will be appended). 1N/AThe flags argument can have UNI_DISPLAY_ISPRINT set to display 1N/AisPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH 1N/Ato display the \\[nrfta\\] as the backslashed versions (like '\n') 1N/A(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\). 1N/AUNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both 1N/AUNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on. 1N/AThe pointer to the PV of the dsv is returned. 1N/A /* isPRINT() is the locale-blind version. */ 1N/A=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags 1N/ABuild to the scalar dsv a displayable version of the scalar sv, 1N/Athe displayable version being at most pvlim bytes long 1N/A(if longer, the rest is truncated and "..." will be appended). 1N/AThe flags argument is as in pv_uni_display(). 1N/AThe pointer to the PV of the dsv is returned. 
1N/A=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2 1N/AReturn true if the strings s1 and s2 differ case-insensitively, false 1N/Aif not (if they are equal case-insensitively). If u1 is true, the 1N/Astring s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true, 1N/Athe string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2 1N/Ais false, the respective string is assumed to be in native 8-bit encoding. 1N/AIf the pe1 and pe2 are non-NULL, the scanning pointers will be copied 1N/Ain there (they will point at the beginning of the I<next> character). 1N/AIf the pointers behind pe1 or pe2 are non-NULL, they are the end 1N/Apointers beyond which scanning will not continue under any 1N/Acircumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and 1N/As2+l2 will be used as goal end pointers that will also stop the scan, 1N/Aand which qualify towards defining a successful match: all the scans 1N/Athat define an explicit length must reach their goal pointers for a match to succeed. 1N/AFor case-insensitiveness, the "casefolding" of Unicode is used 1N/A return 1;
/* mismatch; possible infinite loop or false positive */ 1N/A natbuf[
1] = 0;
/* Need to terminate the buffer. */ 1N/A return 1;
/* mismatch */ 1N/A /* A match is defined by all the scans that specified 1N/A * an explicit length reaching their final goals. */ 1N/A return match ? 0 :
1;
/* 0 match, 1 mismatch */