8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen/* Character used to replace invalid input. */
c8b84f03c71e18f07940d1b60a77a4caf5e7c23bTimo Sirainen#define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD"
0b2b090cdc3d36f30d6d2ec99b35ac0b7657d538Timo Sirainen/* Characters >= base require surrogates */
0b2b090cdc3d36f30d6d2ec99b35ac0b7657d538Timo Sirainen (((chr) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT))
0b2b090cdc3d36f30d6d2ec99b35ac0b7657d538Timo Sirainen (((chr) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK))
3b02371c120dcd455a09148abc5a3f520880fef2Timo Sirainen/* Returns TRUE if given byte is ASCII character or the beginning of a
3b02371c120dcd455a09148abc5a3f520880fef2Timo Sirainen multibyte UTF-8 sequence */
/* UTF-16 surrogate range checks. (The comment directly above this one talks
   about UTF-8 start bytes and does not describe these two macros.)
   Each macro masks chr with 0xfffc00, which discards the low 10 bits, and
   compares the result with the first value of the range. The test is
   therefore true for the whole 0x400-value block starting at
   UTF16_SURROGATE_HIGH_FIRST or UTF16_SURROGATE_LOW_FIRST respectively (both
   constants are defined outside this view). The mask also discards every bit
   above bit 23, so those bits do not influence the result. chr is evaluated
   exactly once. */
2bb5b6721e9971b3bcbb2da48eebead7fd9488eeAki Tuomi#define UTF16_VALID_HIGH_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_HIGH_FIRST)
2bb5b6721e9971b3bcbb2da48eebead7fd9488eeAki Tuomi#define UTF16_VALID_LOW_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_LOW_FIRST)
d9076f5939edf5d20a261494b1a861dcbb0d32e2Timo Sirainen/* Normalize UTF8 input and append it to output buffer.
d9076f5939edf5d20a261494b1a861dcbb0d32e2Timo Sirainen Returns 0 if ok, -1 if input was invalid. Even if input was invalid,
d9076f5939edf5d20a261494b1a861dcbb0d32e2Timo Sirainen as much as possible should be added to output. */
d9076f5939edf5d20a261494b1a861dcbb0d32e2Timo Sirainentypedef int normalizer_func_t(const void *input, size_t size,
f2de6ecc4424533633aea705f12d0f691d7ddf81Timo Sirainenextern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN];
/* Lookup table used by the inline UTF-8 character length helper further down
   in this header: it is indexed with (first byte - (192 + 2)), i.e. entry 0
   corresponds to byte value 0xC2, and the entry is returned as the number of
   bytes in the sequence. Smaller byte values must never be used as an index,
   since the subtraction would wrap around.
   NOTE(review): the table's length and contents are defined elsewhere;
   confirm it covers every byte value from 0xC2 to 0xFF. */
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenextern const uint8_t *const uni_utf8_non1_bytes;
5fc1d7c7caffa7e5616a1681503dfea0fc582aaeAki Tuomistatic inline bool ATTR_PURE uni_is_valid_ucs4(unichar_t chr)
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen/* Returns number of characters in a NUL-terminated unicode string */
68a4946b12583b88fa802e52ebee45cd96056772Timo Sirainenunsigned int uni_strlen(const unichar_t *str) ATTR_PURE;
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen/* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was invalid. */
54df49100a0111a956662cb8a327969badd2d72dTimo Sirainenint uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output);
c6ead31ba07401556abe0c69374d7fbed99844e7Timo Sirainenint uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen/* Translates UCS-4 input to UTF-8 output. */
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainenvoid uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
/* Single-character counterpart of uni_ucs4_to_utf8(): takes one unichar_t by
   value instead of an array and a length.
   NOTE(review): presumably appends the UTF-8 encoding of chr to output;
   the implementation is not visible here, so confirm before relying on it. */
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenvoid uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);
304a9d2db2669ad910577e00dce2f81bfd0d5d39Phil Carmody/* Returns char_bytes (>0) if *chr_r is set, 0 for incomplete trailing character,
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen -1 for invalid input. */
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenint uni_utf8_get_char(const char *input, unichar_t *chr_r);
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenint uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
32ae620015da6ab2ec28e04d3cdcdb4420f1fa6bTimo Sirainen/* Returns number of characters in UTF-8 string. */
88311240b8db117b120171a861a64e399dab57afTimo Sirainenunsigned int uni_utf8_strlen(const char *input) ATTR_PURE;
32ae620015da6ab2ec28e04d3cdcdb4420f1fa6bTimo Sirainen/* Returns number of characters in UTF-8 input of specified size. */
68a4946b12583b88fa802e52ebee45cd96056772Timo Sirainenunsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE;
f66c8939c39e6bcd9dd5482bfd9689bd177ce0d4Timo Sirainen/* Same as uni_utf8_strlen_n(), but if input ends with a partial UTF-8
f66c8939c39e6bcd9dd5482bfd9689bd177ce0d4Timo Sirainen character, don't include it in the return value and set partial_pos_r to
f66c8939c39e6bcd9dd5482bfd9689bd177ce0d4Timo Sirainen where the character begins. Otherwise partial_pos_r is set to the end
f66c8939c39e6bcd9dd5482bfd9689bd177ce0d4Timo Sirainen of the input. */
f66c8939c39e6bcd9dd5482bfd9689bd177ce0d4Timo Sirainenunsigned int uni_utf8_partial_strlen_n(const void *input, size_t size,
a0044466cc46baf25a316ea63781c60aa52b58caTimo Sirainen/* Returns the number of bytes belonging to this UTF-8 character. The given
a0044466cc46baf25a316ea63781c60aa52b58caTimo Sirainen parameter is the first byte of the UTF-8 sequence. Invalid input is
a0044466cc46baf25a316ea63781c60aa52b58caTimo Sirainen returned with length 1. */
68a4946b12583b88fa802e52ebee45cd96056772Timo Sirainenstatic inline unsigned int ATTR_CONST
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen return uni_utf8_non1_bytes[(uint8_t)chr - (192 + 2)];
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen/* Return given character in titlecase. */
68a4946b12583b88fa802e52ebee45cd96056772Timo Sirainenunichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen output buffer. Returns 0 if ok, -1 if input was invalid. This generates
8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen output that's compatible with i;unicode-casemap comparator. Invalid input
8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen is replaced with unicode replacement character (0xfffd). */
3412b625dd238cc0774db968e6c351b007a98e25Timo Sirainenint uni_utf8_to_decomposed_titlecase(const void *input, size_t size,
8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen/* If input contains only valid UTF-8 characters, return TRUE without updating
8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen buf. If input contains invalid UTF-8 characters, replace them with unicode
8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen replacement character (0xfffd), write the output to buf and return FALSE. */
7aa59f55d8a4e02c7039fbd22660c4055bfc8393Timo Sirainenbool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
ef1d718c6a3a3a48b9835b004b8496de9dc4bec5Timo Sirainen/* Returns TRUE if string is valid UTF-8 input. */
296857bde8dbe965bcfe5e96cf06d37c297d9315Timo Sirainen/* Returns TRUE if data contains only valid UTF-8 input. */
296857bde8dbe965bcfe5e96cf06d37c297d9315Timo Sirainenbool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
2bb5b6721e9971b3bcbb2da48eebead7fd9488eeAki Tuomi/* surrogate handling */
2bb5b6721e9971b3bcbb2da48eebead7fd9488eeAki Tuomistatic inline unichar_t uni_join_surrogate(unichar_t high, unichar_t low)
2bb5b6721e9971b3bcbb2da48eebead7fd9488eeAki Tuomi return ((high - UTF16_SURROGATE_HIGH_FIRST)<<10) +
2bb5b6721e9971b3bcbb2da48eebead7fd9488eeAki Tuomistatic inline void uni_split_surrogate(unichar_t chr, unichar_t *high_r, unichar_t *low_r)