fts-tokenizer-generic.c revision 34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "lib.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "buffer.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "unichar.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "bsearch-insert-pos.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer-private.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer-generic-private.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "word-boundary-data.c"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "word-break-data.c"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Lookup table for 7-bit ASCII, indexed by byte value: a nonzero entry
   marks the byte as a word boundary for the simple tokenizer. Zero entries
   are the digits, the letters, apostrophe (39), underscore (95) and
   DEL (127). */
static unsigned char fts_ascii_word_boundaries[128] = {
	/* 0x00-0x0f: control characters */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10-0x1f: control characters */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20-0x2f: space and punctuation; apostrophe (0x27) is not a boundary */
	1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30-0x3f: digits 0-9, then colon through question mark */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
	/* 0x40-0x4f: at sign, then A-O */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50-0x5f: P-Z, brackets/backslash/caret, then underscore */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	/* 0x60-0x6f: backquote, then a-o */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70-0x7f: p-z, braces/bar/tilde, then DEL */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0
};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic int
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_create(const char *const *settings,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer **tokenizer_r,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char **error_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
50f659bc47b06939dcb7694c928c9f8aa7f56229Teemu Huovila enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (i = 0; settings[i] != NULL; i += 2) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *key = settings[i], *value = settings[i+1];
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (strcmp(key, "maxlen") == 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (str_to_uint(value, &max_length) < 0 ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen max_length == 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *error_r = t_strdup_printf(
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "Invalid maxlen setting: %s", value);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return -1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
505bba1d21a851fb9e719bbf381f8be592afa50aTimo Sirainen } else if (strcmp(key, "algorithm") == 0) {
505bba1d21a851fb9e719bbf381f8be592afa50aTimo Sirainen if (strcmp(value, ALGORITHM_TR29_NAME) == 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen algo = BOUNDARY_ALGORITHM_TR29;
505bba1d21a851fb9e719bbf381f8be592afa50aTimo Sirainen else if (strcmp(value, ALGORITHM_SIMPLE_NAME) == 0)
50f659bc47b06939dcb7694c928c9f8aa7f56229Teemu Huovila ;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen else {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *error_r = t_strdup_printf(
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "Invalid algorithm: %s", value);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return -1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
0c827d2094e80ede4c089fc00260d7ffcc764636Timo Sirainen } else if (strcmp(key, "search") == 0) {
0c827d2094e80ede4c089fc00260d7ffcc764636Timo Sirainen /* tokenizing a search string -
0c827d2094e80ede4c089fc00260d7ffcc764636Timo Sirainen makes no difference to us */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen } else {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *error_r = t_strdup_printf("Unknown setting: %s", key);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return -1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok = i_new(struct generic_fts_tokenizer, 1);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (algo == BOUNDARY_ALGORITHM_TR29)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen else
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->max_length = max_length;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->algorithm = algo;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->token = buffer_create_dynamic(default_pool, 64);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *tokenizer_r = &tok->tokenizer;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_destroy(struct fts_tokenizer *_tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok =
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (struct generic_fts_tokenizer *)_tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_free(&tok->token);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_free(tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilastatic int
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilafts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const char **token_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila *token_r = t_strndup(tok->token->data, I_MIN(tok->token->used, tok->max_length));
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_set_used_size(tok->token, 0);
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return 1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* TODO: This is duplicated from unichar.c */
/* Look up value in data[0..count-1]. The whole body, including the return
   statement, comes from the BINARY_NUMBER_SEARCH macro.
   NOTE(review): presumably data must be sorted ascending and *idx_r
   receives the matching index on success - confirm against the macro
   definition in bsearch-insert-pos.h. */
static bool
uint32_find(const uint32_t *data, unsigned int count,
	    uint32_t value, unsigned int *idx_r)
{
	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Tell whether code point c separates words for the simple tokenizer.
   TRUE for the General Punctuation block (U+2000..U+206F) and for any code
   point found in the White_Space, Dash, Terminal_Punctuation, STerm or
   Pattern_White_Space tables, which are searched in that order. */
static bool is_word_break(unichar_t c)
{
	unsigned int pos;

	/* Unicode General Punctuation, including deprecated characters. */
	if (c >= 0x2000 && c <= 0x206f)
		return TRUE;

	/* From word-break-data.c, which is generated from PropList.txt. */
	return uint32_find(White_Space, N_ELEMENTS(White_Space), c, &pos) ||
		uint32_find(Dash, N_ELEMENTS(Dash), c, &pos) ||
		uint32_find(Terminal_Punctuation,
			    N_ELEMENTS(Terminal_Punctuation), c, &pos) ||
		uint32_find(STerm, N_ELEMENTS(STerm), c, &pos) ||
		uint32_find(Pattern_White_Space,
			    N_ELEMENTS(Pattern_White_Space), c, &pos);
}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainendata_is_word_boundary(const unsigned char *data, size_t size, size_t *i)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unichar_t c;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (data[*i] < 0x80)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return fts_ascii_word_boundaries[data[*i]] != 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* unicode punctuation? */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uni_utf8_get_char_n(data + *i, size - *i, &c) <= 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_unreached();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *i += uni_utf8_char_bytes(data[*i]) - 1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return is_word_break(c);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainenstatic void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen{
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen struct generic_fts_tokenizer *tok =
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen (struct generic_fts_tokenizer *)_tok;
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen tok->prev_letter = LETTER_TYPE_NONE;
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen tok->prev_prev_letter = LETTER_TYPE_NONE;
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen tok->last_size = 0;
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen buffer_set_used_size(tok->token, 0);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen}
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilastatic int
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const unsigned char *data, size_t size,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila size_t *skip_r, const char **token_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok =
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (struct generic_fts_tokenizer *)_tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen size_t i, char_start_i, len, start = 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (i = 0; i < size; i++) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen char_start_i = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (data_is_word_boundary(data, size, &i)) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen len = char_start_i - start;
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila buffer_append(tok->token, data + start, len);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->token->used == 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* no text read yet */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen start = i + 1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen continue;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* word boundary found - return a new token */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *skip_r = i + 1;
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return fts_tokenizer_generic_simple_current_token(tok, token_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* word boundary not found yet */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen len = i - start;
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila buffer_append(tok->token, data + start, len);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *skip_r = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila /* return the last token */
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila if (size == 0 && tok->token->used > 0)
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila return fts_tokenizer_generic_simple_current_token(tok, token_r);
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila /* token too long */
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila if (tok->token->used > tok->max_length)
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return fts_tokenizer_generic_simple_current_token(tok, token_r);
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* TODO: Arrange array searches roughly in order of likelyhood of a match.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Make some array of the arrays, so this can be a foreach loop.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Check for Hangul.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen U+058A ( ÖŠ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen HYPHEN.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen*/
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic enum letter_type letter_type(unichar_t c)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int idx;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(CR, N_ELEMENTS(CR), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_CR;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(LF, N_ELEMENTS(LF), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_LF;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Newline, N_ELEMENTS(Newline), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_NEWLINE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Extend, N_ELEMENTS(Extend), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_EXTEND;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Regional_Indicator, N_ELEMENTS(Regional_Indicator), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_REGIONAL_INDICATOR;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Format, N_ELEMENTS(Format), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_FORMAT;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Katakana, N_ELEMENTS(Katakana), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_KATAKANA;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Hebrew_Letter, N_ELEMENTS(Hebrew_Letter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_HEBREW_LETTER;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(ALetter, N_ELEMENTS(ALetter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_ALETTER;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Single_Quote, N_ELEMENTS(Single_Quote), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_SINGLE_QUOTE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Double_Quote, N_ELEMENTS(Double_Quote), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_DOUBLE_QUOTE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidNumLet, N_ELEMENTS(MidNumLet), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_MIDNUMLET;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidLetter, N_ELEMENTS(MidLetter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_MIDLETTER;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidNum, N_ELEMENTS(MidNum), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_MIDNUM;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Numeric, N_ELEMENTS(Numeric), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_NUMERIC;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(ExtendNumLet, N_ELEMENTS(ExtendNumLet), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_EXTENDNUMLET;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return LETTER_TYPE_OTHER;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Handler that aborts through i_panic() instead of returning a verdict.
   NOTE(review): presumably installed for letter types that must never be
   dispatched - confirm at the place where the handlers are registered. */
static bool
letter_panic(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	i_panic("Letter type should not be used.");
}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* WB3, WB3a and WB3b, but really different since we try to eat
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen whitespace between words. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_cr_lf_newline(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Extend and Format characters (WB4): unconditionally reports FALSE. */
static bool
letter_extend_format(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	/* WB4 */
	return FALSE;
}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_regional_indicator(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13c */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_REGIONAL_INDICATOR)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_katakana(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_KATAKANA)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_hebrew(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB5 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB7 WB7c */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_prev_letter == LETTER_TYPE_HEBREW_LETTER &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDLETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB10 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_aletter(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB5 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB7 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_prev_letter == LETTER_TYPE_ALETTER &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDLETTER))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB10 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_single_quote(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB12 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_double_quote(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midnumlet(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB12 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midletter(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midnum(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB12 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_numeric(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB8 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB9 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB11 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if(tok->prev_prev_letter == LETTER_TYPE_NUMERIC &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_MIDNUM ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13b */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_extendnumlet(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB13a */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_NUMERIC ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_KATAKANA ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE; /* Any / Any */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenadd_prev_letter(struct generic_fts_tokenizer *tok, enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if(tok->prev_letter != LETTER_TYPE_NONE) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_prev_letter = tok->prev_letter;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter = lt;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen } else
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter = lt;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/*
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Define what to skip between words.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Include double quotation marks? Messes up parsing?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Does this "reverse approach" include too much in "whitespace"?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Possibly use is_word_break()?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool is_nonword(enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lt == LETTER_TYPE_SINGLE_QUOTE || lt == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE; /* TODO: Include LETTER_TYPE_DOUBLE_QUOTE? */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* The way things are done WB6/7 and WB11/12 "false positives" can
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen leave trailing unwanted chars. They are searched for here. This is
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen very kludgy and should be coded into the rules themselves
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen somehow.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen*/
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool is_one_past_end(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* Short circuit for simple algorithm. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NONE)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6/7 false positive detected at one past end. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_MIDLETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE )
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB12/12 false positive detected at one past end. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_MIDNUM ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilastatic int
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilafts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const char **token_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen size_t end_skip = 0;
ade9b3596a1f2555846a093572069a59d1427b13Teemu Huovila ssize_t len;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (is_one_past_end(tok))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen end_skip = tok->last_size;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
ade9b3596a1f2555846a093572069a59d1427b13Teemu Huovila len = I_MIN(tok->token->used, tok->max_length) - end_skip;
ade9b3596a1f2555846a093572069a59d1427b13Teemu Huovila i_assert(len > 0);
ade9b3596a1f2555846a093572069a59d1427b13Teemu Huovila *token_r = t_strndup(tok->token->data, len);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_set_used_size(tok->token, 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_prev_letter = LETTER_TYPE_NONE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter = LETTER_TYPE_NONE;
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return 1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainenstruct letter_fn {
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen bool (*fn)(struct generic_fts_tokenizer *tok);
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen};
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainenstatic struct letter_fn letter_fns[] = {
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_panic}, {letter_cr_lf_newline}, {letter_cr_lf_newline},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_cr_lf_newline}, {letter_extend_format},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_regional_indicator}, {letter_extend_format},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_katakana}, {letter_hebrew}, {letter_aletter},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_single_quote}, {letter_double_quote},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_midnumlet}, {letter_midletter}, {letter_midnum},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_numeric}, {letter_extendnumlet}, {letter_panic},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_panic}, {letter_other}
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen};
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/*
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen Find word boundaries in input text. Based on Unicode standard annex
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen #29, but tailored for FTS purposes.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen http://www.unicode.org/reports/tr29/
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen Adaptions: No word boundary at Start-Of-Text or End-of-Text (Wb1 and
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen WB2). Break just once, not before and after. Other things also, not
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen really pure tr29. Meant to assist in finding individual words.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: If this letter_fns based approach is too kludgy, do a FSM with function
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen pointers and transition tables.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Alternative idea: Replace everything with a super simplistic
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "lt != ALETTER, HEBREW, NUMERIC, ... --> word break"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Rules get split up over several functions. Is it too
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen confusing?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen*/
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenuni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* No rule knows what to do with just one char, except the linebreaks
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen we eat away (above) anyway. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NONE)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen goto false_out;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (letter_fns[lt].fn(tok))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen false_out:
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* Extend and format types are ignored. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (lt == LETTER_TYPE_EXTEND || lt == LETTER_TYPE_FORMAT)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen add_prev_letter(tok,lt);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilastatic int
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_next_tr29(struct fts_tokenizer *_tok,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const unsigned char *data, size_t size,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila size_t *skip_r, const char **token_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct generic_fts_tokenizer *tok =
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (struct generic_fts_tokenizer *)_tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unichar_t c;
ade9b3596a1f2555846a093572069a59d1427b13Teemu Huovila size_t i, char_start_i, start_skip = 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen enum letter_type lt;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* TODO: Process 8bit chars separately, to speed things up. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (i = 0; i < size; i++) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen char_start_i = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_unreached();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->last_size = uni_utf8_char_bytes(data[i]);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i += tok->last_size - 1; /* Utf8 bytes > 1, for() handles the 1 byte increment. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lt = letter_type(c);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NONE && is_nonword(lt)) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* TODO: test that start_skip works with multibyte utf8 chars */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen start_skip = i + 1; /* Skip non-token chars at start of data */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen continue;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uni_found_word_boundary(tok, lt)) {
ade9b3596a1f2555846a093572069a59d1427b13Teemu Huovila i_assert(char_start_i >= start_skip && size >= start_skip);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_append(tok->token, data + start_skip,
ade9b3596a1f2555846a093572069a59d1427b13Teemu Huovila char_start_i - start_skip);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *skip_r = i + 1;
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return fts_tokenizer_generic_tr29_current_token(tok, token_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
ade9b3596a1f2555846a093572069a59d1427b13Teemu Huovila i_assert(i >= start_skip && size >= start_skip);
ade9b3596a1f2555846a093572069a59d1427b13Teemu Huovila buffer_append(tok->token, data + start_skip, i - start_skip);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *skip_r = i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (size == 0 && tok->token->used > 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* return the last token */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *skip_r = 0;
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return fts_tokenizer_generic_tr29_current_token(tok, token_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilastatic int
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const unsigned char *data ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen size_t size ATTR_UNUSED,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila size_t *skip_r ATTR_UNUSED,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const char **token_r ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_unreached();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_create,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_destroy,
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen fts_tokenizer_generic_reset,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_next
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_tokenizer fts_tokenizer_generic_real = {
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen .name = "generic",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen .v = &generic_tokenizer_vfuncs
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer *fts_tokenizer_generic = &fts_tokenizer_generic_real;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_create,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_destroy,
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen fts_tokenizer_generic_reset,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_next_simple
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29 = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_create,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_destroy,
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen fts_tokenizer_generic_reset,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_generic_next_tr29
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};