/* unichar.h revision c6ead31ba07401556abe0c69374d7fbed99844e7 */
c25356d5978632df6203437e1953bcb29e0c736fTimo Sirainen#ifndef UNICHAR_H
c25356d5978632df6203437e1953bcb29e0c736fTimo Sirainen#define UNICHAR_H
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
/* Character used to replace invalid input (U+FFFD). */
#define UNICODE_REPLACEMENT_CHAR 0xfffd
8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen
/* Characters >= base require surrogates */
#define UTF16_SURROGATE_BASE 0x10000

/* The macros below split (chr - UTF16_SURROGATE_BASE) in two: the bits above
   UTF16_SURROGATE_SHIFT go to the high surrogate and the bits selected by
   UTF16_SURROGATE_MASK go to the low surrogate. */
#define UTF16_SURROGATE_SHIFT 10
#define UTF16_SURROGATE_MASK 0x03ff
#define UTF16_SURROGATE_HIGH_FIRST 0xd800
#define UTF16_SURROGATE_HIGH_LAST 0xdbff
/* NOTE(review): despite its name this has the same value as
   UTF16_SURROGATE_LOW_LAST, i.e. it is the end of the whole surrogate range
   rather than of the high half - confirm the intent against its users. */
#define UTF16_SURROGATE_HIGH_MAX 0xdfff
#define UTF16_SURROGATE_LOW_FIRST 0xdc00
#define UTF16_SURROGATE_LOW_LAST 0xdfff

/* High surrogate for chr. No range checking is done here; per the comment on
   UTF16_SURROGATE_BASE the macro is meant for chr >= UTF16_SURROGATE_BASE. */
#define UTF16_SURROGATE_HIGH(chr) \
	(UTF16_SURROGATE_HIGH_FIRST + \
	 (((chr) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT))
/* Low surrogate for chr. Same precondition as UTF16_SURROGATE_HIGH(). */
#define UTF16_SURROGATE_LOW(chr) \
	(UTF16_SURROGATE_LOW_FIRST + \
	 (((chr) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK))
0b2b090cdc3d36f30d6d2ec99b35ac0b7657d538Timo Sirainen
/* Number of bytes in the utf8_replacement_char[] array declared below.
   NOTE(review): presumably the length of UNICODE_REPLACEMENT_CHAR encoded as
   UTF-8 - confirm against the array's definition. */
#define UTF8_REPLACEMENT_CHAR_LEN 3
f2de6ecc4424533633aea705f12d0f691d7ddf81Timo Sirainen
/* A single character as handled by the UCS-4 functions in this header,
   stored as an unsigned 32-bit value. */
typedef uint32_t unichar_t;
/* Array of unichar_t; referred to as ARRAY_TYPE(unichars) by the prototypes
   in this header. */
ARRAY_DEFINE_TYPE(unichars, unichar_t);
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
/* Replacement byte sequence, UTF8_REPLACEMENT_CHAR_LEN bytes long.
   NOTE(review): presumably UNICODE_REPLACEMENT_CHAR encoded as UTF-8 -
   confirm against the definition. */
extern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN];
/* Lookup table read by uni_utf8_char_bytes(). It is indexed by
   (first byte - (192 + 2)), so entry 0 belongs to byte value 0xC2. */
extern const uint8_t *const uni_utf8_non1_bytes;
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen/* Returns number of characters in a NUL-terminated unicode string */
68a4946b12583b88fa802e52ebee45cd96056772Timo Sirainenunsigned int uni_strlen(const unichar_t *str) ATTR_PURE;
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen/* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen invalid */
54df49100a0111a956662cb8a327969badd2d72dTimo Sirainenint uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output);
c6ead31ba07401556abe0c69374d7fbed99844e7Timo Sirainenint uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
c6ead31ba07401556abe0c69374d7fbed99844e7Timo Sirainen ARRAY_TYPE(unichars) *output);
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen/* Translates UCS-4 input to UTF-8 output. */
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainenvoid uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenvoid uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen/* Returns 1 if *chr_r is set, 0 for incomplete trailing character,
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen -1 for invalid input. */
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenint uni_utf8_get_char(const char *input, unichar_t *chr_r);
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenint uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen/* Returns UTF-8 string length with maximum input size. */
68a4946b12583b88fa802e52ebee45cd96056772Timo Sirainenunsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE;
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
a0044466cc46baf25a316ea63781c60aa52b58caTimo Sirainen/* Returns the number of bytes belonging to this UTF-8 character. The given
a0044466cc46baf25a316ea63781c60aa52b58caTimo Sirainen parameter is the first byte of the UTF-8 sequence. Invalid input is
a0044466cc46baf25a316ea63781c60aa52b58caTimo Sirainen returned with length 1. */
68a4946b12583b88fa802e52ebee45cd96056772Timo Sirainenstatic inline unsigned int ATTR_CONST
68a4946b12583b88fa802e52ebee45cd96056772Timo Sirainenuni_utf8_char_bytes(char chr)
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen{
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen if ((uint8_t)chr < (192 + 2))
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen return 1;
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen return uni_utf8_non1_bytes[(uint8_t)chr - (192 + 2)];
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen}
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen/* Return given character in titlecase. */
68a4946b12583b88fa802e52ebee45cd96056772Timo Sirainenunichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen output buffer. Returns 0 if ok, -1 if input was invalid. This generates
8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen output that's compatible with i;unicode-casemap comparator. Invalid input
8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen is replaced with unicode replacement character (0xfffd). */
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainenint uni_utf8_to_decomposed_titlecase(const void *input, size_t max_len,
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen buffer_t *output);
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen
8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen/* If input contains only valid UTF-8 characters, return TRUE without updating
8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen buf. If input contains invalid UTF-8 characters, replace them with unicode
8e9666f46faceeef0f3c6f706f10f3a873e4b0ebTimo Sirainen replacement character (0xfffd), write the output to buf and return FALSE. */
7aa59f55d8a4e02c7039fbd22660c4055bfc8393Timo Sirainenbool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
7aa59f55d8a4e02c7039fbd22660c4055bfc8393Timo Sirainen buffer_t *buf);
/* Returns TRUE if string is valid UTF-8 input.
   NOTE(review): no length is passed, so str is presumably NUL-terminated -
   confirm against the implementation. */
bool uni_utf8_str_is_valid(const char *str);
/* Returns TRUE if data contains only valid UTF-8 input. The data is given as
   a pointer together with its size. */
bool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
511ba4416aafb9f9ba1a4193703b95a033267068Timo Sirainen
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen#endif