bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "lib.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "buffer.h"
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila#include "str.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "unichar.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "bsearch-insert-pos.h"
5a2910119ec0b878a0d7ca91918b97e9d40a936dTimo Sirainen#include "fts-common.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer-private.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer-generic-private.h"
00544ad37ece26b2c4f2210ed5e5295241d0db19Teemu Huovila#include "fts-tokenizer-common.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "word-boundary-data.c"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "word-break-data.c"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila#define FTS_WB5A_PREFIX_MAX_LENGTH 3 /* Including apostrophe */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Lookup table for the "simple" algorithm, indexed by 7-bit ASCII code:
   a nonzero entry marks the character as a word break. The table is only
   ever read, so it is const. Digits, letters, apostrophe (39),
   underscore (95) and DEL (127) are not word breaks. */
static const unsigned char fts_ascii_word_breaks[128] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */

	1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 32-47: space !"#$%&()*+,-./ (apostrophe excluded) */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, /* 48-63: :;<=>? */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64-79: @ */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 80-95: [\]^ */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96-111: ` */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0  /* 112-127: {|}~ */
};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic int
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_create(const char *const *settings,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer **tokenizer_r,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char **error_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
50f659bc47b06939dcb7694c928c9f8aa7f56229Teemu Huovila enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila bool wb5a = FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (i = 0; settings[i] != NULL; i += 2) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *key = settings[i], *value = settings[i+1];
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (strcmp(key, "maxlen") == 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (str_to_uint(value, &max_length) < 0 ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen max_length == 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *error_r = t_strdup_printf(
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "Invalid maxlen setting: %s", value);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return -1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
505bba1d21a851fb9e719bbf381f8be592afa50aTimo Sirainen } else if (strcmp(key, "algorithm") == 0) {
505bba1d21a851fb9e719bbf381f8be592afa50aTimo Sirainen if (strcmp(value, ALGORITHM_TR29_NAME) == 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen algo = BOUNDARY_ALGORITHM_TR29;
505bba1d21a851fb9e719bbf381f8be592afa50aTimo Sirainen else if (strcmp(value, ALGORITHM_SIMPLE_NAME) == 0)
50f659bc47b06939dcb7694c928c9f8aa7f56229Teemu Huovila ;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen else {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *error_r = t_strdup_printf(
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "Invalid algorithm: %s", value);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return -1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
0c827d2094e80ede4c089fc00260d7ffcc764636Timo Sirainen } else if (strcmp(key, "search") == 0) {
0c827d2094e80ede4c089fc00260d7ffcc764636Timo Sirainen /* tokenizing a search string -
0c827d2094e80ede4c089fc00260d7ffcc764636Timo Sirainen makes no difference to us */
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila } else if (strcasecmp(key, "wb5a") == 0) {
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if (strcasecmp(value, "no") == 0)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila wb5a = FALSE;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila else
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila wb5a = TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen } else {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *error_r = t_strdup_printf("Unknown setting: %s", key);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return -1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if (wb5a && algo != BOUNDARY_ALGORITHM_TR29) {
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila *error_r = "Can not use WB5a for algorithms other than TR29.";
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila return -1;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila }
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok = i_new(struct generic_fts_tokenizer, 1);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (algo == BOUNDARY_ALGORITHM_TR29)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen else
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->max_length = max_length;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->algorithm = algo;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila tok->wb5a = wb5a;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->token = buffer_create_dynamic(default_pool, 64);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *tokenizer_r = &tok->tokenizer;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_destroy(struct fts_tokenizer *_tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok =
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (struct generic_fts_tokenizer *)_tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_free(&tok->token);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_free(tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovilastatic bool
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilafts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const char **token_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen const unsigned char *data = tok->token->data;
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen size_t len = tok->token->used;
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen if (tok->untruncated_length <= tok->max_length) {
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen /* Remove the trailing apostrophe - it was made
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen into U+0027 earlier. There can be only a single such
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen apostrophe, because otherwise the token would have already
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen been split. We also want to remove the trailing apostrophe
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen only if it's the the last character in the nontruncated
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen token - a truncated token may end with apostrophe. */
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen if (len > 0 && data[len-1] == '\'') {
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen len--;
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen i_assert(len > 0 && data[len-1] != '\'');
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen }
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen } else {
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen fts_tokenizer_delete_trailing_partial_char(data, &len);
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen }
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen i_assert(len <= tok->max_length);
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen *token_r = len == 0 ? "" :
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen t_strndup(tok->token->data, len);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_set_used_size(tok->token, 0);
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen tok->untruncated_length = 0;
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen tok->prev_letter = LETTER_TYPE_NONE;
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen return len > 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Looks up value in data[0..count-1] using the BINARY_NUMBER_SEARCH macro,
   which supplies the whole function body including the return value and
   the write to *idx_r.
   NOTE(review): data is presumably expected to be sorted ascending -
   confirm against the macro definition. */
static bool
uint32_find(const uint32_t *data, unsigned int count,
	    uint32_t value, unsigned int *idx_r)
{
	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovilastatic bool fts_uni_word_break(unichar_t c)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int idx;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* Unicode General Punctuation, including deprecated characters. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (c >= 0x2000 && c <= 0x206f)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* From word-break-data.c, which is generated from PropList.txt. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila if (uint32_find(Quotation_Mark, N_ELEMENTS(Quotation_Mark), c, &idx))
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Pattern_White_Space, N_ELEMENTS(Pattern_White_Space), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainenstatic inline bool
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainenfts_simple_is_word_break(struct generic_fts_tokenizer *tok,
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen unichar_t c, bool apostrophe)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen if (apostrophe)
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
dfc9cfd5b80c8a4240841e12425eb23636ce674eTimo Sirainen else if (c < 0x80)
dfc9cfd5b80c8a4240841e12425eb23636ce674eTimo Sirainen return fts_ascii_word_breaks[c] != 0;
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen else
dfc9cfd5b80c8a4240841e12425eb23636ce674eTimo Sirainen return fts_uni_word_break(c);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainenstatic void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen{
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen struct generic_fts_tokenizer *tok =
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen (struct generic_fts_tokenizer *)_tok;
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen tok->prev_letter = LETTER_TYPE_NONE;
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen tok->prev_prev_letter = LETTER_TYPE_NONE;
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen tok->untruncated_length = 0;
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen buffer_set_used_size(tok->token, 0);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen}
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainenstatic void tok_append_truncated(struct generic_fts_tokenizer *tok,
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainen const unsigned char *data, size_t size)
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainen{
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen buffer_append(tok->token, data,
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen I_MIN(size, tok->max_length - tok->token->used));
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen tok->untruncated_length += size;
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainen}
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainen
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilastatic int
9c34a19b60871463270e61a38aae8050a7633513Teemu Huovilafts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const unsigned char *data, size_t size,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen size_t *skip_r, const char **token_r,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const char **error_r ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok =
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (struct generic_fts_tokenizer *)_tok;
8acf0a8559c8e969a9079e65bf021a89cebaf10aTimo Sirainen size_t i, start = 0;
cf755ce29d2c2499b0da66868c3b15840078d0baTimo Sirainen int char_size;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila unichar_t c;
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen bool apostrophe;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila for (i = 0; i < size; i += char_size) {
cf755ce29d2c2499b0da66868c3b15840078d0baTimo Sirainen char_size = uni_utf8_get_char_n(data + i, size - i, &c);
cf755ce29d2c2499b0da66868c3b15840078d0baTimo Sirainen i_assert(char_size > 0);
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen apostrophe = IS_APOSTROPHE(c);
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen if (fts_simple_is_word_break(tok, c, apostrophe)) {
8acf0a8559c8e969a9079e65bf021a89cebaf10aTimo Sirainen tok_append_truncated(tok, data + start, i - start);
3fe4e251c34ba63c4b50df72813e2781dccb562eTimo Sirainen if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila *skip_r = i + char_size;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila return 1;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila }
72c4ef3b44c50c662b37bba93b463b0caeb63a4fTimo Sirainen start = i + char_size;
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen /* it doesn't actually matter at this point how whether
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen subsequent apostrophes are handled by prefix
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen skipping or by ignoring empty tokens - they will be
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen dropped in any case. */
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen tok->prev_letter = LETTER_TYPE_NONE;
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen } else if (apostrophe) {
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen /* all apostrophes require special handling */
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen const unsigned char apostrophe_char = '\'';
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen tok_append_truncated(tok, data + start, i - start);
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen if (tok->token->used > 0)
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen tok_append_truncated(tok, &apostrophe_char, 1);
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen start = i + char_size;
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen } else {
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen tok->prev_letter = LETTER_TYPE_NONE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* word boundary not found yet */
8acf0a8559c8e969a9079e65bf021a89cebaf10aTimo Sirainen tok_append_truncated(tok, data + start, i - start);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *skip_r = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila /* return the last token */
3fe4e251c34ba63c4b50df72813e2781dccb562eTimo Sirainen if (size == 0) {
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila if (fts_tokenizer_generic_simple_current_token(tok, token_r))
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila return 1;
56a21d5d4ff6e1e7b70425b6680bb3626c4ce1ddTimo Sirainen }
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
211c638d81d382517d196ad47565e0d85012c927klemens/* TODO: Arrange array searches roughly in order of likelihood of a match.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Make some array of the arrays, so this can be a foreach loop.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Check for Hangul.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen U+058A ( ÖŠ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen HYPHEN.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen*/
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic enum letter_type letter_type(unichar_t c)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int idx;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila if (IS_APOSTROPHE(c))
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila return LETTER_TYPE_APOSTROPHE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(CR, N_ELEMENTS(CR), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_CR;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(LF, N_ELEMENTS(LF), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_LF;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Newline, N_ELEMENTS(Newline), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_NEWLINE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Extend, N_ELEMENTS(Extend), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_EXTEND;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Regional_Indicator, N_ELEMENTS(Regional_Indicator), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_REGIONAL_INDICATOR;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Format, N_ELEMENTS(Format), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_FORMAT;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Katakana, N_ELEMENTS(Katakana), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_KATAKANA;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Hebrew_Letter, N_ELEMENTS(Hebrew_Letter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_HEBREW_LETTER;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(ALetter, N_ELEMENTS(ALetter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_ALETTER;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Single_Quote, N_ELEMENTS(Single_Quote), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_SINGLE_QUOTE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Double_Quote, N_ELEMENTS(Double_Quote), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_DOUBLE_QUOTE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidNumLet, N_ELEMENTS(MidNumLet), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_MIDNUMLET;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidLetter, N_ELEMENTS(MidLetter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_MIDLETTER;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidNum, N_ELEMENTS(MidNum), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_MIDNUM;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Numeric, N_ELEMENTS(Numeric), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_NUMERIC;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(ExtendNumLet, N_ELEMENTS(ExtendNumLet), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_EXTENDNUMLET;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_OTHER;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Handler for letter types that must never be looked up: panics with a
   fixed message instead of returning a decision. */
static bool
letter_panic(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	i_panic("Letter type should not be used.");
}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* WB3, WB3a and WB3b, but really different since we try to eat
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen whitespace between words. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_cr_lf_newline(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Handler for Extend and Format characters (WB4): the answer is FALSE
   no matter what preceded. */
static bool
letter_extend_format(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	return FALSE;
}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_regional_indicator(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13c */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_REGIONAL_INDICATOR)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_katakana(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_KATAKANA)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_hebrew(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB5 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila /* WB7 WB7c, except MidNumLet */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_prev_letter == LETTER_TYPE_HEBREW_LETTER &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDLETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB10 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_aletter(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila /* WB5a */
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if (IS_WB5A_APOSTROPHE(tok->prev_letter_c) && IS_VOWEL(tok->letter_c)) {
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila tok->seen_wb5a = TRUE;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila return TRUE;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila }
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB5 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila /* WB7, except MidNumLet */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_prev_letter == LETTER_TYPE_ALETTER &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDLETTER))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB10 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_single_quote(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB12 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_double_quote(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovilastatic bool letter_midnumlet(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila /* Break at MidNumLet, non-conformant with WB6/WB7 */
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midletter(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midnum(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB12 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_numeric(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB8 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB9 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB11 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if(tok->prev_prev_letter == LETTER_TYPE_NUMERIC &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_MIDNUM ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_extendnumlet(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13a */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_NUMERIC ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_KATAKANA ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovilastatic bool letter_apostrophe(struct generic_fts_tokenizer *tok)
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila{
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila if (tok->prev_letter == LETTER_TYPE_ALETTER ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila return FALSE;
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila return TRUE; /* Any / Any */
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovilastatic bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenadd_prev_letter(struct generic_fts_tokenizer *tok, enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if(tok->prev_letter != LETTER_TYPE_NONE)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_prev_letter = tok->prev_letter;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila tok->prev_letter = lt;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila}
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovilastatic void
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovilaadd_letter_c(struct generic_fts_tokenizer *tok, unichar_t c)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila{
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if(tok->letter_c != 0)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila tok->prev_letter_c = tok->letter_c;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila tok->letter_c = c;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/*
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Define what to skip between words.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Include double quotation marks? Messes up parsing?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Does this "reverse approach" include too much in "whitespace"?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Possibly use is_word_break()?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen */
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainenstatic bool is_nontoken(enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER ||
6018cfb92a352878c468fedd61c7703c4e2ea30bTeemu Huovila lt == LETTER_TYPE_NUMERIC)
6018cfb92a352878c468fedd61c7703c4e2ea30bTeemu Huovila return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* The way things are done WB6/7 and WB11/12 "false positives" can
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen leave trailing unwanted chars. They are searched for here. This is
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen very kludgy and should be coded into the rules themselves
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen somehow.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen*/
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool is_one_past_end(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6/7 false positive detected at one past end. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_MIDLETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE )
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
73e7fedf77599bb30644bd2e089ce5a8b3a65532Teemu Huovila /* WB11/12 false positive detected at one past end. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_MIDNUM ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainenstatic void
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilafts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const char **token_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen const unsigned char *data = tok->token->data;
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen size_t len = tok->token->used;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen if (is_one_past_end(tok) &&
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen tok->untruncated_length <= tok->max_length) {
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen /* delete the last character */
e8ee7a23b194516087ba27f1af09213991af0b30Timo Sirainen while (!UTF8_IS_START_SEQ(data[len-1]))
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen len--;
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen i_assert(len > 0);
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen len--;
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen } else if (tok->untruncated_length > tok->max_length) {
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen fts_tokenizer_delete_trailing_partial_char(data, &len);
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen }
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen /* we're skipping all non-token chars at the beginning of the word,
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen so by this point we must have something here - even if we just
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen deleted the last character */
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen i_assert(len > 0);
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen i_assert(len <= tok->max_length);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_prev_letter = LETTER_TYPE_NONE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter = LETTER_TYPE_NONE;
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen *token_r = t_strndup(data, len);
759c11290d4bedad20cd1e22fe1007cc0893f079Timo Sirainen buffer_set_used_size(tok->token, 0);
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen tok->untruncated_length = 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovilastatic void wb5a_reinsert(struct generic_fts_tokenizer *tok)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila{
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila string_t *utf8_str = t_str_new(6);
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila uni_ucs4_to_utf8_c(tok->letter_c, utf8_str);
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila buffer_insert(tok->token, 0, str_data(utf8_str), str_len(utf8_str));
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila tok->prev_letter = letter_type(tok->letter_c);
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila tok->letter_c = 0;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila tok->prev_letter_c = 0;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila tok->seen_wb5a = FALSE;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila}
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainenstruct letter_fn {
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen bool (*fn)(struct generic_fts_tokenizer *tok);
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen};
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainenstatic struct letter_fn letter_fns[] = {
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_panic}, {letter_cr_lf_newline}, {letter_cr_lf_newline},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_cr_lf_newline}, {letter_extend_format},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_regional_indicator}, {letter_extend_format},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_katakana}, {letter_hebrew}, {letter_aletter},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_single_quote}, {letter_double_quote},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_midnumlet}, {letter_midletter}, {letter_midnum},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_numeric}, {letter_extendnumlet}, {letter_panic},
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila {letter_panic}, {letter_apostrophe}, {letter_other}
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen};
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/*
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen Find word boundaries in input text. Based on Unicode standard annex
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen #29, but tailored for FTS purposes.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen http://www.unicode.org/reports/tr29/
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
d1623103c73ed7dd8b15b2060ad656fddbed7b46Teemu Huovila Note: The text of tr29 is a living standard, so it keeps
d1623103c73ed7dd8b15b2060ad656fddbed7b46Teemu Huovila changing. In newer specs some characters are combined, like AHLetter
d1623103c73ed7dd8b15b2060ad656fddbed7b46Teemu Huovila (ALetter | Hebrew_Letter) and MidNumLetQ (MidNumLet | Single_Quote).
d1623103c73ed7dd8b15b2060ad656fddbed7b46Teemu Huovila
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila Adaptions:
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila * Added optional WB5a as a configurable option. The cut of prefix is
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila max FTS_WB5A_PREFIX chars.
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila * No word boundary at Start-Of-Text or End-of-Text (Wb1 and WB2).
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila * Break just once, not before and after.
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila * Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen * Other things also (e.g. is_nontoken(), not really pure tr29. Meant
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila to assist in finding individual words.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen*/
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenuni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* No rule knows what to do with just one char, except the linebreaks
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen we eat away (above) anyway. */
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen if (tok->prev_letter != LETTER_TYPE_NONE) {
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen if (letter_fns[lt].fn(tok))
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen return TRUE;
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen if (lt == LETTER_TYPE_EXTEND || lt == LETTER_TYPE_FORMAT) {
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen /* These types are completely ignored. */
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen } else {
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen add_prev_letter(tok,lt);
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilastatic int
9c34a19b60871463270e61a38aae8050a7633513Teemu Huovilafts_tokenizer_generic_tr29_next(struct fts_tokenizer *_tok,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const unsigned char *data, size_t size,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen size_t *skip_r, const char **token_r,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const char **error_r ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok =
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (struct generic_fts_tokenizer *)_tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unichar_t c;
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen size_t i, char_start_i, start_pos = 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen enum letter_type lt;
cf755ce29d2c2499b0da66868c3b15840078d0baTimo Sirainen int char_size;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
afb62c7a52713b079efa16fef6e4de62cf1bf853Timo Sirainen for (i = 0; i < size; ) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen char_start_i = i;
cf755ce29d2c2499b0da66868c3b15840078d0baTimo Sirainen char_size = uni_utf8_get_char_n(data + i, size - i, &c);
cf755ce29d2c2499b0da66868c3b15840078d0baTimo Sirainen i_assert(char_size > 0);
cf755ce29d2c2499b0da66868c3b15840078d0baTimo Sirainen i += char_size;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lt = letter_type(c);
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila /* The WB5a break is detected only when the "after
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila break" char is inspected. That char needs to be
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila reinserted as the "previous char". */
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if (tok->seen_wb5a)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila wb5a_reinsert(tok);
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NONE && is_nontoken(lt)) {
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen /* Skip non-token chars at the beginning of token */
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen i_assert(tok->token->used == 0);
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen start_pos = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen continue;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila add_letter_c(tok, c);
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uni_found_word_boundary(tok, lt)) {
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen i_assert(char_start_i >= start_pos && size >= start_pos);
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen tok_append_truncated(tok, data + start_pos,
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen char_start_i - start_pos);
afb62c7a52713b079efa16fef6e4de62cf1bf853Timo Sirainen *skip_r = i;
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen fts_tokenizer_generic_tr29_current_token(tok, token_r);
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen return 1;
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen } else if (lt == LETTER_TYPE_APOSTROPHE ||
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen lt == LETTER_TYPE_SINGLE_QUOTE) {
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen /* all apostrophes require special handling */
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen const unsigned char apostrophe_char = '\'';
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen tok_append_truncated(tok, data + start_pos,
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen char_start_i - start_pos);
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen tok_append_truncated(tok, &apostrophe_char, 1);
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen start_pos = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen i_assert(i >= start_pos && size >= start_pos);
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen tok_append_truncated(tok, data + start_pos, i - start_pos);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *skip_r = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (size == 0 && tok->token->used > 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* return the last token */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *skip_r = 0;
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen fts_tokenizer_generic_tr29_current_token(tok, token_r);
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen return 1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilastatic int
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const unsigned char *data ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen size_t size ATTR_UNUSED,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila size_t *skip_r ATTR_UNUSED,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const char **token_r ATTR_UNUSED,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const char **error_r ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_unreached();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_create,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_destroy,
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen fts_tokenizer_generic_reset,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_next
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_tokenizer fts_tokenizer_generic_real = {
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen .name = "generic",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen .v = &generic_tokenizer_vfuncs
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer *fts_tokenizer_generic = &fts_tokenizer_generic_real;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_create,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_destroy,
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen fts_tokenizer_generic_reset,
9c34a19b60871463270e61a38aae8050a7633513Teemu Huovila fts_tokenizer_generic_simple_next
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29 = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_create,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_destroy,
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen fts_tokenizer_generic_reset,
9c34a19b60871463270e61a38aae8050a7633513Teemu Huovila fts_tokenizer_generic_tr29_next
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};