unichar.c revision f7fd9747a69216e36fb2615dc74b8728c9484503
02c335c23bf5fa225a467c19f2c063fb0dc7b8c3Timo Sirainen/* Copyright (c) 2005-2007 Dovecot authors, see the included COPYING file */
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch#include "lib.h"
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch#include "buffer.h"
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch#include "bsearch-insert-pos.h"
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch#include "unichar.h"
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
a991cfe2157e58ee43bc580f517ce9ef0dfb7acfStephan Bosch#include "unicodemap.c"
de0181258ab66b527ad8dc7e51a8efa76b4658d0Stephan Bosch
/* Inclusive code point range that uni_utf8_to_decomposed_titlecase() hands
   to the arithmetic Hangul decomposition instead of the lookup tables. */
#define HANGUL_FIRST 0xac00
#define HANGUL_LAST 0xd7a3

/* UTF-8 sequence lengths for the 62 highest byte values. The array size
   (256 - 192 - 2) suggests that entry 0 belongs to byte 0xc2 and that the
   table is indexed with (byte - 0xc2) by uni_utf8_char_bytes() in unichar.h
   -- NOTE(review): confirm against that macro. */
static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
	/* 30 entries: 2 byte sequences */
	2,2,2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2,2,2,
	/* 16 entries: 3 byte sequences */
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
	/* 8 entries: 4 byte sequences */
	4,4,4,4,4,4,4,4,
	/* 4 entries: 5 byte sequences */
	5,5,5,5,
	/* 2 entries: 6 byte sequences */
	6,6,
	/* last 2 entries: no multibyte sequence starts with these */
	1,1
};

/* Non-static alias so that code outside this file can read the table. */
const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
7384b4e78eaab44693c985192276e31322155e32Stephan Boschunsigned int uni_strlen(const unichar_t *str)
1d048c5050f03c24251e5af8087e640de21b2d62Timo Sirainen{
1d048c5050f03c24251e5af8087e640de21b2d62Timo Sirainen unsigned int len = 0;
1d048c5050f03c24251e5af8087e640de21b2d62Timo Sirainen
1d048c5050f03c24251e5af8087e640de21b2d62Timo Sirainen for (len = 0; str[len] != 0; len++) ;
1d048c5050f03c24251e5af8087e640de21b2d62Timo Sirainen
1d048c5050f03c24251e5af8087e640de21b2d62Timo Sirainen return len;
1d048c5050f03c24251e5af8087e640de21b2d62Timo Sirainen}
1d048c5050f03c24251e5af8087e640de21b2d62Timo Sirainen
1d048c5050f03c24251e5af8087e640de21b2d62Timo Sirainenint uni_utf8_get_char(const char *input, unichar_t *chr_r)
1d048c5050f03c24251e5af8087e640de21b2d62Timo Sirainen{
1d048c5050f03c24251e5af8087e640de21b2d62Timo Sirainen return uni_utf8_get_char_n((const unsigned char *)input, (size_t)-1,
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch chr_r);
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch}
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
7384b4e78eaab44693c985192276e31322155e32Stephan Boschint uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch{
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch const unsigned char *input = _input;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch unichar_t chr;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch unsigned int i, len;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch int ret;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch i_assert(max_len > 0);
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch if (*input < 0x80) {
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch *chr_r = *input;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch return 1;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch }
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch /* first byte has len highest bits set, followed by zero bit.
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch the rest of the bits are used as the highest bits of the value. */
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch chr = *input;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch len = uni_utf8_char_bytes(*input);
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch switch (len) {
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch case 2:
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch chr &= 0x1f;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch break;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch case 3:
9145c8b5eda526d05bd4a7ced20f6f6f2ff8df03Stephan Bosch chr &= 0x0f;
9145c8b5eda526d05bd4a7ced20f6f6f2ff8df03Stephan Bosch break;
9145c8b5eda526d05bd4a7ced20f6f6f2ff8df03Stephan Bosch case 4:
9145c8b5eda526d05bd4a7ced20f6f6f2ff8df03Stephan Bosch chr &= 0x07;
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch break;
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch case 5:
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch chr &= 0x03;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch break;
47a53a80656dc400ff8effdc1432a69fbf5ae8baTimo Sirainen case 6:
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch chr &= 0x01;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch break;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch default:
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch /* only 7bit chars should have len==1 */
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch i_assert(len == 1);
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch return -1;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch }
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
47a53a80656dc400ff8effdc1432a69fbf5ae8baTimo Sirainen if (len <= max_len)
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch ret = 1;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch else {
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch /* check first if the input is invalid before returning 0 */
6d573191bea1a64d6046be070487a5705a2d0204Stephan Bosch ret = 0;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch len = max_len;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch }
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch /* the following bytes must all be 10xxxxxx */
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch for (i = 1; i < len; i++) {
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch if ((input[i] & 0xc0) != 0x80)
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch return input[i] == '\0' ? 0 : -1;
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch chr <<= 6;
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch chr |= input[i] & 0x3f;
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch }
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch *chr_r = chr;
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch return ret;
f74dbd3ff682fea040f60383e001620d1f1b09d3Stephan Bosch}
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Boschint uni_utf8_to_ucs4(const char *input, buffer_t *output)
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch{
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch unichar_t chr;
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch while (*input != '\0') {
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch if (uni_utf8_get_char(input, &chr) <= 0) {
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch /* invalid input */
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch return -1;
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch }
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch input += uni_utf8_char_bytes(*input);
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch buffer_append(output, &chr, sizeof(chr));
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch }
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch return 0;
30f35cf5d1e1374d7fab4231e86144fc106a8e79Stephan Bosch}
30f35cf5d1e1374d7fab4231e86144fc106a8e79Stephan Bosch
30f35cf5d1e1374d7fab4231e86144fc106a8e79Stephan Boschvoid uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output)
30f35cf5d1e1374d7fab4231e86144fc106a8e79Stephan Bosch{
30f35cf5d1e1374d7fab4231e86144fc106a8e79Stephan Bosch for (; *input != '\0' && len > 0; input++, len--)
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch uni_ucs4_to_utf8_c(*input, output);
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch}
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Boschvoid uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output)
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch{
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch unsigned char first;
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch int bitpos;
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch if (chr < 0x80) {
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch buffer_append_c(output, chr);
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch return;
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch }
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch
f74dbd3ff682fea040f60383e001620d1f1b09d3Stephan Bosch i_assert(chr < 0x80000000); /* 1 << (5*6 + 1) */
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch if (chr < (1 << (6 + 5))) {
f74dbd3ff682fea040f60383e001620d1f1b09d3Stephan Bosch /* 110xxxxx */
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch bitpos = 6;
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch first = 0x80 | 0x40;
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch } else if (chr < (1 << ((2*6) + 4))) {
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch /* 1110xxxx */
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch bitpos = 2*6;
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch first = 0x80 | 0x40 | 0x20;
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch } else if (chr < (1 << ((3*6) + 3))) {
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch /* 11110xxx */
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch bitpos = 3*6;
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch first = 0x80 | 0x40 | 0x20 | 0x10;
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch } else if (chr < (1 << ((4*6) + 2))) {
f883bf3eff62f5d27df5ee9ee664edc38a77937fStephan Bosch /* 111110xx */
f883bf3eff62f5d27df5ee9ee664edc38a77937fStephan Bosch bitpos = 4*6;
f883bf3eff62f5d27df5ee9ee664edc38a77937fStephan Bosch first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08;
f883bf3eff62f5d27df5ee9ee664edc38a77937fStephan Bosch } else {
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch /* 1111110x */
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch bitpos = 5*6;
e47c2f17d8136c4d972d1074a3f84ba2ecef4fdcStephan Bosch first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08 | 0x04;
f74dbd3ff682fea040f60383e001620d1f1b09d3Stephan Bosch }
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch buffer_append_c(output, first | (chr >> bitpos));
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch do {
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch bitpos -= 6;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch buffer_append_c(output, 0x80 | ((chr >> bitpos) & 0x3f));
e9228a3918aa0243eff4aae1ff5462bd3198417fTimo Sirainen } while (bitpos > 0);
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch}
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
/* Counts the UTF-8 characters in the first size bytes of _input, stopping
   early at a NUL byte. Only lead bytes are looked at (continuation bytes
   aren't validated). A character whose announced length reaches past size
   isn't counted. */
unsigned int uni_utf8_strlen_n(const void *_input, size_t size)
{
	const unsigned char *bytes = _input;
	unsigned int count = 0;
	size_t pos = 0;

	while (pos < size && bytes[pos] != '\0') {
		pos += uni_utf8_char_bytes(bytes[pos]);
		if (pos > size) {
			/* the last character is truncated */
			break;
		}
		count++;
	}
	return count;
}
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch
/* Searches the count element array data for value. Callers in this file
   treat a TRUE return as "found" and then use *idx_r as the index of the
   match. The whole body, including the return statements, comes from
   BINARY_NUMBER_SEARCH() -- presumably a binary search that needs data to
   be sorted ascending; NOTE(review): confirm in bsearch-insert-pos.h. */
static bool
uint16_find(const uint16_t *data, unsigned int count, uint16_t value,
	    unsigned int *idx_r)
{
	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}
b66def5dadd3e7c250313a938d26ad113663f86bStephan Bosch
/* 32 bit variant of the key lookup: searches the count element array data
   for value. Callers in this file treat a TRUE return as "found" and then
   use *idx_r as the index of the match. The whole body, including the
   return statements, comes from BINARY_NUMBER_SEARCH() -- presumably a
   binary search that needs data to be sorted ascending; NOTE(review):
   confirm in bsearch-insert-pos.h. */
static bool
uint32_find(const uint32_t *data, unsigned int count, uint32_t value,
	    unsigned int *idx_r)
{
	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}
fe681e6db72f30bd754b622005bbe298e5ca775aTimo Sirainen
7384b4e78eaab44693c985192276e31322155e32Stephan Boschunichar_t uni_ucs4_to_titlecase(unichar_t chr)
a991cfe2157e58ee43bc580f517ce9ef0dfb7acfStephan Bosch{
a991cfe2157e58ee43bc580f517ce9ef0dfb7acfStephan Bosch unsigned int idx;
a991cfe2157e58ee43bc580f517ce9ef0dfb7acfStephan Bosch
a991cfe2157e58ee43bc580f517ce9ef0dfb7acfStephan Bosch if (chr <= 0xffff) {
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch if (!uint16_find(titlecase16_keys, N_ELEMENTS(titlecase16_keys),
a991cfe2157e58ee43bc580f517ce9ef0dfb7acfStephan Bosch chr, &idx))
fb1be3de0159d6a10e916ad992e2bc53be64c6d5Timo Sirainen return chr;
fb1be3de0159d6a10e916ad992e2bc53be64c6d5Timo Sirainen else
129596c93692b21d6c6b1313b389774af24c2983Stephan Bosch return titlecase16_values[idx];
fca68889b287d8eed4babe72a231bd6079da012dStephan Bosch } else {
65c0e43da8cfc730eeb4634f8aa384081bbfa4e7Timo Sirainen if (!uint32_find(titlecase32_keys, N_ELEMENTS(titlecase32_keys),
65c0e43da8cfc730eeb4634f8aa384081bbfa4e7Timo Sirainen chr, &idx))
65c0e43da8cfc730eeb4634f8aa384081bbfa4e7Timo Sirainen return chr;
65c0e43da8cfc730eeb4634f8aa384081bbfa4e7Timo Sirainen else
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch return titlecase32_values[idx];
a62fe4b300e2f591e939993aec4cac1e7ae30ad1Stephan Bosch }
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch}
d1f964d3f1dd9c5868b134c4f44dd63f3722eef7Timo Sirainen
7384b4e78eaab44693c985192276e31322155e32Stephan Boschstatic bool uni_ucs4_decompose_uni(unichar_t *chr)
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch{
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch unsigned int idx;
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch if (*chr <= 0xffff) {
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch if (!uint16_find(uni16_decomp_keys,
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch N_ELEMENTS(uni16_decomp_keys),
d1f964d3f1dd9c5868b134c4f44dd63f3722eef7Timo Sirainen *chr, &idx))
d1f964d3f1dd9c5868b134c4f44dd63f3722eef7Timo Sirainen return FALSE;
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch *chr = uni16_decomp_values[idx];
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch } else {
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch if (!uint32_find(uni32_decomp_keys,
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch N_ELEMENTS(uni32_decomp_keys),
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch *chr, &idx))
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch return FALSE;
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch *chr = uni32_decomp_values[idx];
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch }
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch return TRUE;
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch}
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch
79f8a20424633e806447bc9375a5ab403aabc758Stephan Boschstatic void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output)
d1f964d3f1dd9c5868b134c4f44dd63f3722eef7Timo Sirainen{
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch#define SBase HANGUL_FIRST
79f8a20424633e806447bc9375a5ab403aabc758Stephan Bosch#define LBase 0x1100
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch#define VBase 0x1161
fc94140acba51adafedafbc8491a3223a51db7a8Stephan Bosch#define TBase 0x11A7
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch#define LCount 19
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch#define VCount 21
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch#define TCount 28
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch#define NCount (VCount * TCount)
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch unsigned int SIndex = chr - SBase;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch unichar_t L = LBase + SIndex / NCount;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch unichar_t V = VBase + (SIndex % NCount) / TCount;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch unichar_t T = TBase + SIndex % TCount;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
4219de12b28f1936219e27501b9c4b27a4f8d53cStephan Bosch uni_ucs4_to_utf8_c(L, output);
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch uni_ucs4_to_utf8_c(V, output);
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch if (T != TBase) uni_ucs4_to_utf8_c(T, output);
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch}
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
7384b4e78eaab44693c985192276e31322155e32Stephan Boschstatic bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch{
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch const uint16_t *value;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch unsigned int idx;
93c9909f68f7d057e38cca3b4612ec8d0bf42999Timo Sirainen
93c9909f68f7d057e38cca3b4612ec8d0bf42999Timo Sirainen if (chr > 0xffff)
93c9909f68f7d057e38cca3b4612ec8d0bf42999Timo Sirainen return FALSE;
93c9909f68f7d057e38cca3b4612ec8d0bf42999Timo Sirainen
93c9909f68f7d057e38cca3b4612ec8d0bf42999Timo Sirainen if (!uint16_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys),
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch chr, &idx))
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch return FALSE;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch
49287618521ff2c69385456de116e5d1581426c0Timo Sirainen value = &multidecomp_values[multidecomp_offsets[idx]];
49287618521ff2c69385456de116e5d1581426c0Timo Sirainen for (; *value != 0; value++)
49287618521ff2c69385456de116e5d1581426c0Timo Sirainen uni_ucs4_to_utf8_c(*value, output);
49287618521ff2c69385456de116e5d1581426c0Timo Sirainen return TRUE;
49287618521ff2c69385456de116e5d1581426c0Timo Sirainen}
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Boschint uni_utf8_to_decomposed_titlecase(const void *_input, size_t max_len,
30f35cf5d1e1374d7fab4231e86144fc106a8e79Stephan Bosch buffer_t *output)
30f35cf5d1e1374d7fab4231e86144fc106a8e79Stephan Bosch{
30f35cf5d1e1374d7fab4231e86144fc106a8e79Stephan Bosch const unsigned char *input = _input;
30f35cf5d1e1374d7fab4231e86144fc106a8e79Stephan Bosch unsigned int bytes;
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch unichar_t chr;
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch int ret = 0;
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch while (max_len > 0 && *input != '\0') {
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch if (uni_utf8_get_char_n(input, max_len, &chr) <= 0) {
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch /* invalid input. try the next byte. */
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch ret = -1;
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch input++; max_len--;
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch continue;
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch }
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch bytes = uni_utf8_char_bytes(*input);
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch input += bytes;
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch max_len -= bytes;
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch chr = uni_ucs4_to_titlecase(chr);
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST)
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch uni_ucs4_decompose_hangul_utf8(chr, output);
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch else if (uni_ucs4_decompose_uni(&chr) ||
4c4c4a740bbb1b674d4b0dae009d1919f8ad96b7Stephan Bosch !uni_ucs4_decompose_multi_utf8(chr, output))
4c4c4a740bbb1b674d4b0dae009d1919f8ad96b7Stephan Bosch uni_ucs4_to_utf8_c(chr, output);
4c4c4a740bbb1b674d4b0dae009d1919f8ad96b7Stephan Bosch }
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch return ret;
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch}
87c121a4c05b9cee46f1f757ec6999d441519abfStephan Bosch
/* Checks whether input begins with a structurally valid multibyte UTF-8
   sequence. size is the number of readable bytes and must be at least 1.
   Meant to be called only when input[0] >= 0x80, as the callers in this
   file do.

   Returns the length of the sequence in bytes, or 0 if the lead byte can't
   start a multibyte sequence, the sequence doesn't fit into size bytes, or
   one of the trailing bytes isn't a continuation byte. */
static inline unsigned int
is_valid_utf8_seq(const unsigned char *input, size_t size)
{
	unsigned int i, len;

	len = uni_utf8_char_bytes(input[0]);
	/* A non-ASCII byte with length 1 is either a stray continuation
	   byte or a byte that never appears in UTF-8. */
	if (unlikely(len > size || len == 1))
		return 0;

	/* The rest of the bytes must be 10xxxxxx. Their uni_utf8_char_bytes()
	   value can't be used for this check: going by the length table in
	   this file it's 1 for continuation bytes as well as for ASCII
	   bytes, so it can't tell them apart. */
	for (i = 1; i < len; i++) {
		if (unlikely((input[i] & 0xc0) != 0x80))
			return 0;
	}
	return len;
}
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainenbool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen buffer_t *buf)
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen{
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen size_t i, len;
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen /* find the first invalid utf8 sequence */
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen for (i = 0; i < size;) {
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen if (input[i] < 0x80)
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen i++;
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen else {
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen len = is_valid_utf8_seq(input + i, size-i);
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen if (unlikely(len == 0))
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen goto broken;
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen i += len;
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen }
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen }
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen return TRUE;
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainenbroken:
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen /* broken utf-8 input - skip the broken characters */
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen buffer_append(buf, input, i++);
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen while (i < size) {
84740b03d3ee9e96a2e446a54729188764c99292Timo Sirainen if (input[i] < 0x80) {
6d573191bea1a64d6046be070487a5705a2d0204Stephan Bosch buffer_append_c(buf, input[i++]);
6d573191bea1a64d6046be070487a5705a2d0204Stephan Bosch continue;
6d573191bea1a64d6046be070487a5705a2d0204Stephan Bosch }
6d573191bea1a64d6046be070487a5705a2d0204Stephan Bosch
6d573191bea1a64d6046be070487a5705a2d0204Stephan Bosch len = is_valid_utf8_seq(input + i, size-i);
6d573191bea1a64d6046be070487a5705a2d0204Stephan Bosch if (len == 0) {
6d573191bea1a64d6046be070487a5705a2d0204Stephan Bosch i++;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch continue;
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch }
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch buffer_append(buf, input + i, len);
4521d35c263add6af3f1ae55b3760291767ce50cTimo Sirainen i += len;
4521d35c263add6af3f1ae55b3760291767ce50cTimo Sirainen }
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch return FALSE;
65c0e43da8cfc730eeb4634f8aa384081bbfa4e7Timo Sirainen}
7384b4e78eaab44693c985192276e31322155e32Stephan Bosch