/* src/lib/unichar.h
   unichar.h revision f66c8939c39e6bcd9dd5482bfd9689bd177ce0d4 */
9b2bd54c98edb185a3985410367754ab33217362bnicholes#ifndef UNICHAR_H
9b2bd54c98edb185a3985410367754ab33217362bnicholes#define UNICHAR_H
9b2bd54c98edb185a3985410367754ab33217362bnicholes
/* Code point that is substituted for invalid input
   (U+FFFD, the Unicode replacement character). */
#define UNICODE_REPLACEMENT_CHAR 0xfffd
9b2bd54c98edb185a3985410367754ab33217362bnicholes
/* UTF-16 surrogate pair helpers.

   Characters >= UTF16_SURROGATE_BASE require surrogates: the value
   (chr - UTF16_SURROGATE_BASE) is split into two halves of
   UTF16_SURROGATE_SHIFT bits each, and each half is added to the first
   value of its surrogate range. */
#define UTF16_SURROGATE_BASE 0x10000

/* Number of payload bits carried by one surrogate, and the mask selecting
   exactly that many low bits. */
#define UTF16_SURROGATE_SHIFT 10
#define UTF16_SURROGATE_MASK 0x03ff

/* Inclusive range of high (first) surrogates. */
#define UTF16_SURROGATE_HIGH_FIRST 0xd800
#define UTF16_SURROGATE_HIGH_LAST 0xdbff
/* NOTE(review): this has the same value as UTF16_SURROGATE_LOW_LAST, i.e. it
   is the end of the whole surrogate range rather than of the high range.
   Its users are not visible in this header - confirm the intended meaning
   before relying on it. */
#define UTF16_SURROGATE_HIGH_MAX 0xdfff
/* Inclusive range of low (second) surrogates. */
#define UTF16_SURROGATE_LOW_FIRST 0xdc00
#define UTF16_SURROGATE_LOW_LAST 0xdfff

/* High surrogate for chr: the bits above the low UTF16_SURROGATE_SHIFT bits
   of (chr - UTF16_SURROGATE_BASE). The argument is expanded exactly once.
   No range check is done; the caller passes chr >= UTF16_SURROGATE_BASE. */
#define UTF16_SURROGATE_HIGH(chr) \
    (UTF16_SURROGATE_HIGH_FIRST + \
     (((chr) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT))
/* Low surrogate for chr: the low UTF16_SURROGATE_SHIFT bits of
   (chr - UTF16_SURROGATE_BASE). The argument is expanded exactly once.
   No range check is done; the caller passes chr >= UTF16_SURROGATE_BASE. */
#define UTF16_SURROGATE_LOW(chr) \
    (UTF16_SURROGATE_LOW_FIRST + \
     (((chr) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK))
9b2bd54c98edb185a3985410367754ab33217362bnicholes
/* Number of bytes in the utf8_replacement_char[] array declared in this
   header. */
#define UTF8_REPLACEMENT_CHAR_LEN 3
9b2bd54c98edb185a3985410367754ab33217362bnicholes
/* One character as handled by the UCS-4 side of this API: an unsigned
   32-bit value. */
typedef uint32_t unichar_t;
/* Defines the array type that the prototypes in this header refer to as
   ARRAY_TYPE(unichars). ARRAY_DEFINE_TYPE itself comes from outside this
   header. */
ARRAY_DEFINE_TYPE(unichars, unichar_t);
9b2bd54c98edb185a3985410367754ab33217362bnicholes
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Normalize UTF8 input and append it to output buffer.
9b2bd54c98edb185a3985410367754ab33217362bnicholes   Returns 0 if ok, -1 if input was invalid. Even if input was invalid,
9b2bd54c98edb185a3985410367754ab33217362bnicholes   as much as possible should be added to output. */
9b2bd54c98edb185a3985410367754ab33217362bnicholestypedef int normalizer_func_t(const void *input, size_t size,
9b2bd54c98edb185a3985410367754ab33217362bnicholes                  buffer_t *output);
9b2bd54c98edb185a3985410367754ab33217362bnicholes
9b2bd54c98edb185a3985410367754ab33217362bnicholesextern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN];
9b2bd54c98edb185a3985410367754ab33217362bnicholesextern const uint8_t *const uni_utf8_non1_bytes;
9b2bd54c98edb185a3985410367754ab33217362bnicholes
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Returns number of characters in a NUL-terminated unicode string */
9b2bd54c98edb185a3985410367754ab33217362bnicholesunsigned int uni_strlen(const unichar_t *str) ATTR_PURE;
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was
9b2bd54c98edb185a3985410367754ab33217362bnicholes   invalid */
9b2bd54c98edb185a3985410367754ab33217362bnicholesint uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output);
9b2bd54c98edb185a3985410367754ab33217362bnicholesint uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
9b2bd54c98edb185a3985410367754ab33217362bnicholes               ARRAY_TYPE(unichars) *output);
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Translates UCS-4 input to UTF-8 output. */
9b2bd54c98edb185a3985410367754ab33217362bnicholesvoid uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
9b2bd54c98edb185a3985410367754ab33217362bnicholesvoid uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);
9b2bd54c98edb185a3985410367754ab33217362bnicholes
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Returns 1 if *chr_r is set, 0 for incomplete trailing character,
9b2bd54c98edb185a3985410367754ab33217362bnicholes   -1 for invalid input. */
be06f010941d039a422f1bbd7e321a1a0e92e291bnicholesint uni_utf8_get_char(const char *input, unichar_t *chr_r);
9b2bd54c98edb185a3985410367754ab33217362bnicholesint uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Returns UTF-8 string length. */
9b2bd54c98edb185a3985410367754ab33217362bnicholesunsigned int uni_utf8_strlen(const char *input) ATTR_PURE;
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Returns UTF-8 string length with maximum input size. */
9b2bd54c98edb185a3985410367754ab33217362bnicholesunsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE;
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Same as uni_utf8_strlen_n(), but if input ends with a partial UTF-8
be06f010941d039a422f1bbd7e321a1a0e92e291bnicholes   character, don't include it in the return value and set partial_pos_r to
9b2bd54c98edb185a3985410367754ab33217362bnicholes   where the character begins. Otherwise partial_pos_r is set to the end
9b2bd54c98edb185a3985410367754ab33217362bnicholes   of the input. */
9b2bd54c98edb185a3985410367754ab33217362bnicholesunsigned int uni_utf8_partial_strlen_n(const void *input, size_t size,
9b2bd54c98edb185a3985410367754ab33217362bnicholes                       size_t *partial_pos_r);
9b2bd54c98edb185a3985410367754ab33217362bnicholes
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Returns the number of bytes belonging to this UTF-8 character. The given
be06f010941d039a422f1bbd7e321a1a0e92e291bnicholes   parameter is the first byte of the UTF-8 sequence. Invalid input is
9b2bd54c98edb185a3985410367754ab33217362bnicholes   returned with length 1. */
9b2bd54c98edb185a3985410367754ab33217362bnicholesstatic inline unsigned int ATTR_CONST
9b2bd54c98edb185a3985410367754ab33217362bnicholesuni_utf8_char_bytes(char chr)
9b2bd54c98edb185a3985410367754ab33217362bnicholes{
9b2bd54c98edb185a3985410367754ab33217362bnicholes    /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
9b2bd54c98edb185a3985410367754ab33217362bnicholes    if ((uint8_t)chr < (192 + 2))
be06f010941d039a422f1bbd7e321a1a0e92e291bnicholes        return 1;
9b2bd54c98edb185a3985410367754ab33217362bnicholes    return uni_utf8_non1_bytes[(uint8_t)chr - (192 + 2)];
9b2bd54c98edb185a3985410367754ab33217362bnicholes}
9b2bd54c98edb185a3985410367754ab33217362bnicholes
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Return given character in titlecase. */
9b2bd54c98edb185a3985410367754ab33217362bnicholesunichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
9b2bd54c98edb185a3985410367754ab33217362bnicholes
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
9b2bd54c98edb185a3985410367754ab33217362bnicholes   output buffer. Returns 0 if ok, -1 if input was invalid. This generates
9b2bd54c98edb185a3985410367754ab33217362bnicholes   output that's compatible with i;unicode-casemap comparator. Invalid input
9b2bd54c98edb185a3985410367754ab33217362bnicholes   is replaced with unicode replacement character (0xfffd). */
9b2bd54c98edb185a3985410367754ab33217362bnicholesint uni_utf8_to_decomposed_titlecase(const void *input, size_t size,
9b2bd54c98edb185a3985410367754ab33217362bnicholes                     buffer_t *output);
9b2bd54c98edb185a3985410367754ab33217362bnicholes
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* If input contains only valid UTF-8 characters, return TRUE without updating
9b2bd54c98edb185a3985410367754ab33217362bnicholes   buf. If input contains invalid UTF-8 characters, replace them with unicode
9b2bd54c98edb185a3985410367754ab33217362bnicholes   replacement character (0xfffd), write the output to buf and return FALSE. */
9b2bd54c98edb185a3985410367754ab33217362bnicholesbool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
9b2bd54c98edb185a3985410367754ab33217362bnicholes                 buffer_t *buf);
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Returns TRUE if string is valid UTF-8 input. */
9b2bd54c98edb185a3985410367754ab33217362bnicholesbool uni_utf8_str_is_valid(const char *str);
9b2bd54c98edb185a3985410367754ab33217362bnicholes/* Returns TRUE if data contains only valid UTF-8 input. */
ac7985784d08a3655291f24f711812b4d8b1cbcffuankgbool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
9b2bd54c98edb185a3985410367754ab33217362bnicholes
9b2bd54c98edb185a3985410367754ab33217362bnicholes#endif
9b2bd54c98edb185a3985410367754ab33217362bnicholes