fts-tokenizer-generic.c revision 67360bc4d7c3fbcedcf7364ea2290406c8e0d082
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "lib.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "buffer.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "unichar.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "bsearch-insert-pos.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer-private.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer-generic-private.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "word-boundary-data.c"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "word-break-data.c"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Token length limit (in bytes of the token buffer) used when the "maxlen"
   setting is not given. */
#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30

/* TRUE for the non-ASCII apostrophe code points:
   U+2019 RIGHT SINGLE QUOTATION MARK and U+FF07 FULLWIDTH APOSTROPHE.
   NOTE: the argument is evaluated more than once. */
#define IS_NONASCII_APOSTROPHE(c) ((c) == 0x2019 || (c) == 0xFF07)

/* TRUE for U+0027 APOSTROPHE and for the non-ASCII apostrophes above.
   NOTE: the argument is evaluated more than once. */
#define IS_APOSTROPHE(c) ((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila
/* Lookup table for the simple algorithm, indexed by an ASCII code point
   (0..127): non-zero means the character is a word break. Control
   characters, space and most punctuation are breaks; digits, letters,
   '_' (95), DEL (127) and the apostrophe (39) are not. The apostrophe is
   handled separately by the tokenizer. The table is read-only. */
static const unsigned char fts_ascii_word_breaks[128] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */

	1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 32-47: space !"#$%&()*+,-./ (not ') */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, /* 48-63: :;<=>? */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64-79: @ */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 80-95: [\]^ */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96-111: ` */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0  /* 112-127: {|}~ */
};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic int
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_create(const char *const *settings,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer **tokenizer_r,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char **error_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
50f659bc47b06939dcb7694c928c9f8aa7f56229Teemu Huovila enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (i = 0; settings[i] != NULL; i += 2) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *key = settings[i], *value = settings[i+1];
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (strcmp(key, "maxlen") == 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (str_to_uint(value, &max_length) < 0 ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen max_length == 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *error_r = t_strdup_printf(
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "Invalid maxlen setting: %s", value);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return -1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
505bba1d21a851fb9e719bbf381f8be592afa50aTimo Sirainen } else if (strcmp(key, "algorithm") == 0) {
505bba1d21a851fb9e719bbf381f8be592afa50aTimo Sirainen if (strcmp(value, ALGORITHM_TR29_NAME) == 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen algo = BOUNDARY_ALGORITHM_TR29;
505bba1d21a851fb9e719bbf381f8be592afa50aTimo Sirainen else if (strcmp(value, ALGORITHM_SIMPLE_NAME) == 0)
50f659bc47b06939dcb7694c928c9f8aa7f56229Teemu Huovila ;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen else {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *error_r = t_strdup_printf(
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "Invalid algorithm: %s", value);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return -1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
0c827d2094e80ede4c089fc00260d7ffcc764636Timo Sirainen } else if (strcmp(key, "search") == 0) {
0c827d2094e80ede4c089fc00260d7ffcc764636Timo Sirainen /* tokenizing a search string -
0c827d2094e80ede4c089fc00260d7ffcc764636Timo Sirainen makes no difference to us */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen } else {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *error_r = t_strdup_printf("Unknown setting: %s", key);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return -1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok = i_new(struct generic_fts_tokenizer, 1);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (algo == BOUNDARY_ALGORITHM_TR29)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen else
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->max_length = max_length;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->algorithm = algo;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->token = buffer_create_dynamic(default_pool, 64);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *tokenizer_r = &tok->tokenizer;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_destroy(struct fts_tokenizer *_tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok =
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (struct generic_fts_tokenizer *)_tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_free(&tok->token);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_free(tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Return a t_strndup() copy of the leading part of data[0..size) as measured
   by uni_utf8_partial_strlen_n(), so that a partial UTF-8 character at the
   end of a truncated input is dropped. Asserts that at least one byte
   remains. */
static const char *fts_uni_strndup(const unsigned char *data, size_t size)
{
	size_t valid_len;

	/* only the byte position is needed, not the character count */
	(void)uni_utf8_partial_strlen_n(data, size, &valid_len);
	i_assert(valid_len > 0);
	return t_strndup(data, valid_len);
}
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainen
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovilastatic bool
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilafts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const char **token_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila const unsigned char *data;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila size_t start = 0, len;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila /* clean trailing and starting apostrophes. they were all made
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila into U+0027 earlier. */
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila data = tok->token->data;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila len = tok->token->used;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila while (len > 0 && data[len - 1] == '\'')
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila len--;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila while (start < len && data[start] == '\'')
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila start++;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila *token_r = len - start == 0 ? "" :
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila fts_uni_strndup(CONST_PTR_OFFSET(tok->token->data, start),
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila len - start);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_set_used_size(tok->token, 0);
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila return (*token_r)[0] != '\0';
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Look up value in data[0..count) using the BINARY_NUMBER_SEARCH() macro.
   NOTE(review): the macro is not defined in this file (presumably it comes
   from bsearch-insert-pos.h); it is expected to supply this function's
   return statements and to fill *idx_r, and to require data to be sorted -
   confirm against the macro definition. */
static bool uint32_find(const uint32_t *data, unsigned int count,
			uint32_t value, unsigned int *idx_r)
{
	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovilastatic bool fts_uni_word_break(unichar_t c)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int idx;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* Unicode General Punctuation, including deprecated characters. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (c >= 0x2000 && c <= 0x206f)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* From word-break-data.c, which is generated from PropList.txt. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila if (uint32_find(Quotation_Mark, N_ELEMENTS(Quotation_Mark), c, &idx))
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Pattern_White_Space, N_ELEMENTS(Pattern_White_Space), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainenstatic inline bool
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainenfts_simple_is_word_break(struct generic_fts_tokenizer *tok,
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen unichar_t c, bool apostrophe)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen if (apostrophe)
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
dfc9cfd5b80c8a4240841e12425eb23636ce674eTimo Sirainen else if (c < 0x80)
dfc9cfd5b80c8a4240841e12425eb23636ce674eTimo Sirainen return fts_ascii_word_breaks[c] != 0;
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen else
dfc9cfd5b80c8a4240841e12425eb23636ce674eTimo Sirainen return fts_uni_word_break(c);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainenstatic void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen{
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen struct generic_fts_tokenizer *tok =
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen (struct generic_fts_tokenizer *)_tok;
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen tok->prev_letter = LETTER_TYPE_NONE;
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen tok->prev_prev_letter = LETTER_TYPE_NONE;
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen buffer_set_used_size(tok->token, 0);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen}
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainenstatic void tok_append_truncated(struct generic_fts_tokenizer *tok,
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainen const unsigned char *data, size_t size)
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainen{
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila size_t append_len, pos = 0, appended = 0;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila unichar_t c;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen if (size == 0)
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen return;
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen if (data[0] == '\'' && tok->token->used == 0) {
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen /* Skip apostrophes in the beginning of the token.
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen We need to do it here so that we don't truncate the
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen token too early. */
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen data++;
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen size--;
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen if (size == 0)
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen return;
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen i_assert(data[0] != '\'');
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen }
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainen i_assert(tok->max_length >= tok->token->used);
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila append_len = I_MIN(size, tok->max_length - tok->token->used);
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainen
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila /* Append only one kind of apostrophes. Simplifies things when returning
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila token. */
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila while (pos < append_len) {
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila if (uni_utf8_get_char_n(data + pos, size - pos, &c) <= 0)
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila i_unreached();
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila if (IS_NONASCII_APOSTROPHE(c)) {
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila buffer_append(tok->token, data, pos);
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila buffer_append_c(tok->token, '\'');
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila appended = pos + 1;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila }
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila pos += uni_utf8_char_bytes(data[pos]);
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila }
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila if (appended < append_len)
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila buffer_append(tok->token, data + appended, append_len - appended);
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainen}
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainen
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilastatic int
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const unsigned char *data, size_t size,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen size_t *skip_r, const char **token_r,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const char **error_r ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok =
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (struct generic_fts_tokenizer *)_tok;
8acf0a8559c8e969a9079e65bf021a89cebaf10aTimo Sirainen size_t i, start = 0;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila unsigned int char_size;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila unichar_t c;
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen bool apostrophe;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila for (i = 0; i < size; i += char_size) {
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila i_unreached();
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila char_size = uni_utf8_char_bytes(data[i]);
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen apostrophe = IS_APOSTROPHE(c);
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen if (fts_simple_is_word_break(tok, c, apostrophe)) {
8acf0a8559c8e969a9079e65bf021a89cebaf10aTimo Sirainen tok_append_truncated(tok, data + start, i - start);
3fe4e251c34ba63c4b50df72813e2781dccb562eTimo Sirainen if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila *skip_r = i + char_size;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila return 1;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila }
72c4ef3b44c50c662b37bba93b463b0caeb63a4fTimo Sirainen start = i + char_size;
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen /* it doesn't actually matter at this point how whether
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen subsequent apostrophes are handled by prefix
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen skipping or by ignoring empty tokens - they will be
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen dropped in any case. */
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen tok->prev_letter = LETTER_TYPE_NONE;
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen } else {
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen tok->prev_letter = apostrophe ?
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen LETTER_TYPE_SINGLE_QUOTE : LETTER_TYPE_NONE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* word boundary not found yet */
8acf0a8559c8e969a9079e65bf021a89cebaf10aTimo Sirainen tok_append_truncated(tok, data + start, i - start);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *skip_r = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila /* return the last token */
3fe4e251c34ba63c4b50df72813e2781dccb562eTimo Sirainen if (size == 0) {
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila if (fts_tokenizer_generic_simple_current_token(tok, token_r))
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila return 1;
56a21d5d4ff6e1e7b70425b6680bb3626c4ce1ddTimo Sirainen }
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* TODO: Arrange array searches roughly in order of likelihood of a match.
   TODO: Make some array of the arrays, so this can be a foreach loop.
   TODO: Check for Hangul.
   TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
   U+058A ( ֊ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
   HYPHEN.
   TODO
*/
/* Classify c into a letter type.

   Apostrophes (IS_APOSTROPHE) are recognized first. After that the
   code point tables are searched in the order listed below and the type of
   the first table containing c is returned. LETTER_TYPE_OTHER is returned
   when no table matches. The tables are presumably provided by the
   included word-boundary-data.c. */
static enum letter_type letter_type(unichar_t c)
{
	static const struct {
		const uint32_t *codepoints;
		unsigned int count;
		enum letter_type type;
	} classes[] = {
		{ CR, N_ELEMENTS(CR), LETTER_TYPE_CR },
		{ LF, N_ELEMENTS(LF), LETTER_TYPE_LF },
		{ Newline, N_ELEMENTS(Newline), LETTER_TYPE_NEWLINE },
		{ Extend, N_ELEMENTS(Extend), LETTER_TYPE_EXTEND },
		{ Regional_Indicator, N_ELEMENTS(Regional_Indicator),
		  LETTER_TYPE_REGIONAL_INDICATOR },
		{ Format, N_ELEMENTS(Format), LETTER_TYPE_FORMAT },
		{ Katakana, N_ELEMENTS(Katakana), LETTER_TYPE_KATAKANA },
		{ Hebrew_Letter, N_ELEMENTS(Hebrew_Letter),
		  LETTER_TYPE_HEBREW_LETTER },
		{ ALetter, N_ELEMENTS(ALetter), LETTER_TYPE_ALETTER },
		{ Single_Quote, N_ELEMENTS(Single_Quote),
		  LETTER_TYPE_SINGLE_QUOTE },
		{ Double_Quote, N_ELEMENTS(Double_Quote),
		  LETTER_TYPE_DOUBLE_QUOTE },
		{ MidNumLet, N_ELEMENTS(MidNumLet), LETTER_TYPE_MIDNUMLET },
		{ MidLetter, N_ELEMENTS(MidLetter), LETTER_TYPE_MIDLETTER },
		{ MidNum, N_ELEMENTS(MidNum), LETTER_TYPE_MIDNUM },
		{ Numeric, N_ELEMENTS(Numeric), LETTER_TYPE_NUMERIC },
		{ ExtendNumLet, N_ELEMENTS(ExtendNumLet),
		  LETTER_TYPE_EXTENDNUMLET }
	};
	unsigned int i, idx;

	if (IS_APOSTROPHE(c))
		return LETTER_TYPE_APOSTROPHE;

	for (i = 0; i < N_ELEMENTS(classes); i++) {
		if (uint32_find(classes[i].codepoints, classes[i].count,
				c, &idx))
			return classes[i].type;
	}
	return LETTER_TYPE_OTHER;
}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Handler for letter types that must never reach a handler: it panics
   unconditionally. There is no return statement, so i_panic() is expected
   not to return. */
static bool letter_panic(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	i_panic("Letter type should not be used.");
}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* WB3, WB3a and WB3b, but really different since we try to eat
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen whitespace between words. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_cr_lf_newline(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Handler for Extend and Format characters (WB4): the result is always
   FALSE, regardless of the tokenizer state. */
static bool letter_extend_format(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	return FALSE;
}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_regional_indicator(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13c */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_REGIONAL_INDICATOR)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_katakana(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_KATAKANA)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_hebrew(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB5 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila /* WB7 WB7c, except MidNumLet */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_prev_letter == LETTER_TYPE_HEBREW_LETTER &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDLETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB10 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_aletter(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB5 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila /* WB7, except MidNumLet */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_prev_letter == LETTER_TYPE_ALETTER &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDLETTER))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB10 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_single_quote(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB12 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_double_quote(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovilastatic bool letter_midnumlet(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila /* Break at MidNumLet, non-conformant with WB6/WB7 */
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midletter(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midnum(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB12 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_numeric(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB8 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB9 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB11 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if(tok->prev_prev_letter == LETTER_TYPE_NUMERIC &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_MIDNUM ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_extendnumlet(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13a */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_NUMERIC ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_KATAKANA ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovilastatic bool letter_apostrophe(struct generic_fts_tokenizer *tok)
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila{
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila if (tok->prev_letter == LETTER_TYPE_ALETTER ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila return FALSE;
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila return TRUE; /* Any / Any */
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenadd_prev_letter(struct generic_fts_tokenizer *tok, enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if(tok->prev_letter != LETTER_TYPE_NONE) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_prev_letter = tok->prev_letter;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter = lt;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen } else
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter = lt;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/*
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Define what to skip between words.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Include double quotation marks? Messes up parsing?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Does this "reverse approach" include too much in "whitespace"?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Possibly use is_word_break()?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen */
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainenstatic bool is_nontoken(enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER ||
6018cfb92a352878c468fedd61c7703c4e2ea30bTeemu Huovila lt == LETTER_TYPE_NUMERIC)
6018cfb92a352878c468fedd61c7703c4e2ea30bTeemu Huovila return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* The way things are done, WB6/7 and WB11/12 "false positives" can
   leave trailing unwanted chars. They are searched for here. This is
   very kludgy and should be coded into the rules themselves
   somehow.
*/
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool is_one_past_end(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6/7 false positive detected at one past end. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_MIDLETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE )
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
73e7fedf77599bb30644bd2e089ce5a8b3a65532Teemu Huovila /* WB11/12 false positive detected at one past end. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_MIDNUM ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainenstatic void
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilafts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const char **token_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen const unsigned char *data = tok->token->data;
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen ssize_t len = tok->token->used;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen if (is_one_past_end(tok)) {
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen /* delete the last character */
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen while ((data[len-1] & 0x80) != 0)
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen len--;
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen i_assert(len > 0);
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen len--;
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen }
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen /* we're skipping all non-token chars at the beginning of the word,
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen so by this point we must have something here - even if we just
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen deleted the last character */
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen i_assert(len > 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_prev_letter = LETTER_TYPE_NONE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter = LETTER_TYPE_NONE;
759c11290d4bedad20cd1e22fe1007cc0893f079Timo Sirainen
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen *token_r = fts_uni_strndup(data, len);
759c11290d4bedad20cd1e22fe1007cc0893f079Timo Sirainen buffer_set_used_size(tok->token, 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainenstruct letter_fn {
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen bool (*fn)(struct generic_fts_tokenizer *tok);
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen};
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainenstatic struct letter_fn letter_fns[] = {
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_panic}, {letter_cr_lf_newline}, {letter_cr_lf_newline},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_cr_lf_newline}, {letter_extend_format},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_regional_indicator}, {letter_extend_format},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_katakana}, {letter_hebrew}, {letter_aletter},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_single_quote}, {letter_double_quote},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_midnumlet}, {letter_midletter}, {letter_midnum},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_numeric}, {letter_extendnumlet}, {letter_panic},
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila {letter_panic}, {letter_apostrophe}, {letter_other}
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen};
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen
/*
   Find word boundaries in input text. Based on Unicode standard annex
   #29, but tailored for FTS purposes.
   http://www.unicode.org/reports/tr29/

   Adaptations:
   * No word boundary at Start-of-Text or End-of-Text (WB1 and WB2).
   * Break just once, not before and after.
   * Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
   * Other things also (e.g. is_nontoken()), not really pure tr29. Meant
     to assist in finding individual words.
*/
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenuni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* No rule knows what to do with just one char, except the linebreaks
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen we eat away (above) anyway. */
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen if (tok->prev_letter != LETTER_TYPE_NONE) {
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen if (letter_fns[lt].fn(tok))
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen return TRUE;
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen if (lt == LETTER_TYPE_EXTEND || lt == LETTER_TYPE_FORMAT) {
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen /* These types are completely ignored. */
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen } else {
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen add_prev_letter(tok,lt);
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilastatic int
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_next_tr29(struct fts_tokenizer *_tok,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const unsigned char *data, size_t size,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen size_t *skip_r, const char **token_r,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const char **error_r ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok =
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (struct generic_fts_tokenizer *)_tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unichar_t c;
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen size_t i, char_start_i, start_pos = 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen enum letter_type lt;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* TODO: Process 8bit chars separately, to speed things up. */
afb62c7a52713b079efa16fef6e4de62cf1bf853Timo Sirainen for (i = 0; i < size; ) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen char_start_i = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_unreached();
afb62c7a52713b079efa16fef6e4de62cf1bf853Timo Sirainen i += uni_utf8_char_bytes(data[i]);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lt = letter_type(c);
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NONE && is_nontoken(lt)) {
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen /* Skip non-token chars at the beginning of token */
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen i_assert(tok->token->used == 0);
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen start_pos = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen continue;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uni_found_word_boundary(tok, lt)) {
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen i_assert(char_start_i >= start_pos && size >= start_pos);
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen tok_append_truncated(tok, data + start_pos,
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen char_start_i - start_pos);
afb62c7a52713b079efa16fef6e4de62cf1bf853Timo Sirainen *skip_r = i;
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen fts_tokenizer_generic_tr29_current_token(tok, token_r);
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen return 1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen i_assert(i >= start_pos && size >= start_pos);
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen tok_append_truncated(tok, data + start_pos, i - start_pos);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *skip_r = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (size == 0 && tok->token->used > 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* return the last token */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *skip_r = 0;
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen fts_tokenizer_generic_tr29_current_token(tok, token_r);
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen return 1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilastatic int
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const unsigned char *data ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen size_t size ATTR_UNUSED,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila size_t *skip_r ATTR_UNUSED,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const char **token_r ATTR_UNUSED,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const char **error_r ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_unreached();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_create,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_destroy,
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen fts_tokenizer_generic_reset,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_next
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_tokenizer fts_tokenizer_generic_real = {
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen .name = "generic",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen .v = &generic_tokenizer_vfuncs
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer *fts_tokenizer_generic = &fts_tokenizer_generic_real;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_create,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_destroy,
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen fts_tokenizer_generic_reset,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_next_simple
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29 = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_create,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_destroy,
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen fts_tokenizer_generic_reset,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_next_tr29
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};