bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen#include "lib.h"
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila#include "mempool.h"
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen#include "array.h"
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila#include "str.h"
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen#include "unichar.h"
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen#include "fts-icu.h"
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen#include <unicode/uchar.h>
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen#include <unicode/ucasemap.h>
d3acad538059ba27f269d390516be7e9fb44294fTimo Sirainen#include <unicode/uclean.h>
d3acad538059ba27f269d390516be7e9fb44294fTimo Sirainen
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainenstatic struct UCaseMap *icu_csm = NULL;
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainenstatic struct UCaseMap *fts_icu_csm(void)
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen{
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen UErrorCode err = U_ZERO_ERROR;
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen if (icu_csm != NULL)
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen return icu_csm;
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen icu_csm = ucasemap_open(NULL, U_FOLD_CASE_DEFAULT, &err);
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen if (U_FAILURE(err)) {
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen i_fatal("LibICU ucasemap_open() failed: %s",
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen u_errorName(err));
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen }
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen return icu_csm;
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen}
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainenvoid fts_icu_utf8_to_utf16(ARRAY_TYPE(icu_utf16) *dest_utf16,
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen const char *src_utf8)
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen{
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen buffer_t *dest_buf = dest_utf16->arr.buffer;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen UErrorCode err = U_ZERO_ERROR;
2ac5f36aa7c2e7a07ba8815d43a6d7483f62e74cTimo Sirainen size_t src_bytes = strlen(src_utf8);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen int32_t utf16_len;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen UChar *dest_data, *retp = NULL;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen int32_t avail_uchars = 0;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen /* try to encode with the current buffer size */
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen avail_uchars = buffer_get_writable_size(dest_buf) / sizeof(UChar);
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen dest_data = buffer_get_space_unsafe(dest_buf, 0,
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen buffer_get_writable_size(dest_buf));
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen retp = u_strFromUTF8Lenient(dest_data, avail_uchars,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen &utf16_len, src_utf8, src_bytes, &err);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen if (err == U_BUFFER_OVERFLOW_ERROR) {
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen /* try again with a larger buffer */
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen dest_data = buffer_get_space_unsafe(dest_buf, 0,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen utf16_len * sizeof(UChar));
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen err = U_ZERO_ERROR;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen retp = u_strFromUTF8Lenient(dest_data, utf16_len,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen &utf16_len, src_utf8,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen src_bytes, &err);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen }
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen if (U_FAILURE(err)) {
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen i_panic("LibICU u_strFromUTF8Lenient() failed: %s",
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen u_errorName(err));
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen }
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen buffer_set_used_size(dest_buf, utf16_len * sizeof(UChar));
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen i_assert(retp == dest_data);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen}
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainenvoid fts_icu_utf16_to_utf8(string_t *dest_utf8, const UChar *src_utf16,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen unsigned int src_len)
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen{
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen int32_t dest_len = 0;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen int32_t sub_num = 0;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen char *dest_data, *retp = NULL;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen UErrorCode err = U_ZERO_ERROR;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen /* try to encode with the current buffer size */
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen dest_data = buffer_get_space_unsafe(dest_utf8, 0,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen buffer_get_writable_size(dest_utf8));
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen retp = u_strToUTF8WithSub(dest_data, buffer_get_writable_size(dest_utf8),
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen &dest_len, src_utf16, src_len,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen UNICODE_REPLACEMENT_CHAR, &sub_num, &err);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen if (err == U_BUFFER_OVERFLOW_ERROR) {
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen /* try again with a larger buffer */
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen dest_data = buffer_get_space_unsafe(dest_utf8, 0, dest_len);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen err = U_ZERO_ERROR;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen retp = u_strToUTF8WithSub(dest_data, buffer_get_writable_size(dest_utf8), &dest_len,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen src_utf16, src_len,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen UNICODE_REPLACEMENT_CHAR,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen &sub_num, &err);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen }
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen if (U_FAILURE(err)) {
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen i_panic("LibICU u_strToUTF8WithSub() failed: %s",
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen u_errorName(err));
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen }
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen buffer_set_used_size(dest_utf8, dest_len);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen i_assert(retp == dest_data);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen}
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainenint fts_icu_translate(ARRAY_TYPE(icu_utf16) *dest_utf16, const UChar *src_utf16,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen unsigned int src_len, UTransliterator *transliterator,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen const char **error_r)
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen{
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen buffer_t *dest_buf = dest_utf16->arr.buffer;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen UErrorCode err = U_ZERO_ERROR;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen int32_t utf16_len = src_len;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen UChar *dest_data;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen int32_t avail_uchars, limit = src_len;
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen size_t dest_pos = dest_buf->used;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen /* translation is done in-place in the buffer. try first with the
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen current buffer size. */
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen array_append(dest_utf16, src_utf16, src_len);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen avail_uchars = (buffer_get_writable_size(dest_buf)-dest_pos) / sizeof(UChar);
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen dest_data = buffer_get_space_unsafe(dest_buf, dest_pos,
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen buffer_get_writable_size(dest_buf) - dest_pos);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen utrans_transUChars(transliterator, dest_data, &utf16_len,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen avail_uchars, 0, &limit, &err);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen if (err == U_BUFFER_OVERFLOW_ERROR) {
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen /* try again with a larger buffer */
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen err = U_ZERO_ERROR;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen avail_uchars = utf16_len;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen limit = utf16_len = src_len;
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen buffer_write(dest_buf, dest_pos,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen src_utf16, src_len*sizeof(UChar));
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen dest_data = buffer_get_space_unsafe(dest_buf, dest_pos,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen avail_uchars * sizeof(UChar));
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen utrans_transUChars(transliterator, dest_data, &utf16_len,
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen avail_uchars, 0, &limit, &err);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen i_assert(err != U_BUFFER_OVERFLOW_ERROR);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen }
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen if (U_FAILURE(err)) {
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen *error_r = t_strdup_printf("LibICU utrans_transUChars() failed: %s",
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen u_errorName(err));
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen buffer_set_used_size(dest_buf, dest_pos);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen return -1;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen }
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen buffer_set_used_size(dest_buf, utf16_len * sizeof(UChar));
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen return 0;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen}
d3acad538059ba27f269d390516be7e9fb44294fTimo Sirainen
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainenvoid fts_icu_lcase(string_t *dest_utf8, const char *src_utf8)
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen{
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen struct UCaseMap *csm = fts_icu_csm();
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen size_t avail_bytes, dest_pos = dest_utf8->used;
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen char *dest_data;
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen int dest_full_len;
7b3f0e0a464ec0632b9a831b58bce16c2f85ea1aTimo Sirainen UErrorCode err = U_ZERO_ERROR;
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen avail_bytes = buffer_get_writable_size(dest_utf8) - dest_pos;
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen dest_data = buffer_get_space_unsafe(dest_utf8, dest_pos, avail_bytes);
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen
c25abb52b05f4a7e380134ac21e7525d0b46187aTimo Sirainen /* ucasemap_utf8ToLower() may need to be called multiple times, because
c25abb52b05f4a7e380134ac21e7525d0b46187aTimo Sirainen the first return value may not be large enough. */
c25abb52b05f4a7e380134ac21e7525d0b46187aTimo Sirainen for (unsigned int i = 0;; i++) {
c25abb52b05f4a7e380134ac21e7525d0b46187aTimo Sirainen dest_full_len = ucasemap_utf8ToLower(csm, dest_data, avail_bytes,
c25abb52b05f4a7e380134ac21e7525d0b46187aTimo Sirainen src_utf8, -1, &err);
c25abb52b05f4a7e380134ac21e7525d0b46187aTimo Sirainen if (err != U_BUFFER_OVERFLOW_ERROR || i == 2)
c25abb52b05f4a7e380134ac21e7525d0b46187aTimo Sirainen break;
c25abb52b05f4a7e380134ac21e7525d0b46187aTimo Sirainen
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen err = U_ZERO_ERROR;
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen dest_data = buffer_get_space_unsafe(dest_utf8, dest_pos, dest_full_len);
c25abb52b05f4a7e380134ac21e7525d0b46187aTimo Sirainen avail_bytes = dest_full_len;
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen }
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen if (U_FAILURE(err)) {
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen i_fatal("LibICU ucasemap_utf8ToLower() failed: %s",
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen u_errorName(err));
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen }
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen buffer_set_used_size(dest_utf8, dest_full_len);
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen}
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen
d3acad538059ba27f269d390516be7e9fb44294fTimo Sirainenvoid fts_icu_deinit(void)
d3acad538059ba27f269d390516be7e9fb44294fTimo Sirainen{
f07e311812e0bc77760f321e3b6329ba5f41ce53Timo Sirainen if (icu_csm != NULL) {
37dfa8907f8216aa73e3880f3f98aba634458e42Timo Sirainen ucasemap_close(icu_csm);
f07e311812e0bc77760f321e3b6329ba5f41ce53Timo Sirainen icu_csm = NULL;
f07e311812e0bc77760f321e3b6329ba5f41ce53Timo Sirainen }
d3acad538059ba27f269d390516be7e9fb44294fTimo Sirainen u_cleanup();
d3acad538059ba27f269d390516be7e9fb44294fTimo Sirainen}
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovilaint fts_icu_transliterator_create(const char *id,
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila UTransliterator **transliterator_r,
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila const char **error_r)
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila{
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila UErrorCode err = U_ZERO_ERROR;
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila UParseError perr;
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen ARRAY_TYPE(icu_utf16) id_utf16;
efe78d3ba24fc866af1c79b9223dc0809ba26cadStephan Bosch i_zero(&perr);
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen t_array_init(&id_utf16, strlen(id));
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen fts_icu_utf8_to_utf16(&id_utf16, id);
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen *transliterator_r = utrans_openU(array_idx(&id_utf16, 0),
dcb783533ad1d6944db7c227cde46414a575d81cTimo Sirainen array_count(&id_utf16),
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila UTRANS_FORWARD, NULL, 0, &perr, &err);
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila if (U_FAILURE(err)) {
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila string_t *str = t_str_new(128);
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila str_printfa(str, "Failed to open transliterator for id '%s': %s",
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila id, u_errorName(err));
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila if (perr.line >= 1) {
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila /* we have only one line in our ID */
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila str_printfa(str, " (parse error on offset %u)",
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila perr.offset);
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila }
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila *error_r = str_c(str);
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila return -1;
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila }
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila return 0;
7c1ce38a29ecdc17480aacb5bac184f42ac05786Teemu Huovila}