unichar.h revision 7aa59f55d8a4e02c7039fbd22660c4055bfc8393
c25356d5978632df6203437e1953bcb29e0c736fTimo Sirainen#ifndef UNICHAR_H
c25356d5978632df6203437e1953bcb29e0c736fTimo Sirainen#define UNICHAR_H
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainentypedef uint32_t unichar_t;
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenextern const uint8_t *const uni_utf8_non1_bytes;
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
/* Returns the number of characters in a NUL-terminated Unicode string. */
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainenunsigned int uni_strlen(const unichar_t *str);
/* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was
   invalid. */
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainenint uni_utf8_to_ucs4(const char *input, buffer_t *output);
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen/* Translates UCS-4 input to UTF-8 output. */
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainenvoid uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenvoid uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
/* Returns 1 if *chr_r is set, 0 for an incomplete trailing character,
   -1 for invalid input. */
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenint uni_utf8_get_char(const char *input, unichar_t *chr_r);
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenint uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
/* Returns the UTF-8 string length of input, examining at most size bytes. */
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainenunsigned int uni_utf8_strlen_n(const void *input, size_t size);
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen/* Returns the number of bytes belonging to this partial UTF-8 character.
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen Invalid input is returned with length 1. */
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainenstatic inline unsigned int uni_utf8_char_bytes(char chr)
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen{
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen if ((uint8_t)chr < (192 + 2))
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen return 1;
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen return uni_utf8_non1_bytes[(uint8_t)chr - (192 + 2)];
2a7605bb97dc9ed8accf2537fad1073a5fc5ff48Timo Sirainen}
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen/* Return given character in titlecase. */
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainenunichar_t uni_ucs4_to_titlecase(unichar_t chr);
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen
/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
   the output buffer. Returns 0 if ok, -1 if input was invalid. This generates
   output that's compatible with the i;unicode-casemap comparator. */
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainenint uni_utf8_to_decomposed_titlecase(const void *input, size_t max_len,
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen buffer_t *output);
0ddb604f911b908085ef787455c015a91dc9c365Timo Sirainen
/* If input contains only valid UTF-8 characters, return TRUE. If input
   contains invalid UTF-8 characters, write only the valid ones to buf and
   return FALSE. */
7aa59f55d8a4e02c7039fbd22660c4055bfc8393Timo Sirainenbool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
7aa59f55d8a4e02c7039fbd22660c4055bfc8393Timo Sirainen buffer_t *buf);
511ba4416aafb9f9ba1a4193703b95a033267068Timo Sirainen
aa883f5fbc68920c48c4f52919e8a5bb9611e678Timo Sirainen#endif