fts-tokenizer-generic.c revision a2b6f7f91c904706acbaa0e900df01de6d046861
/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher#include "lib.h"
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher#include "buffer.h"
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher#include "unichar.h"
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher#include "bsearch-insert-pos.h"
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher#include "fts-tokenizer-private.h"
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher#include "fts-tokenizer-generic-private.h"
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher#include "word-boundary-data.c"
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher#include "word-break-data.c"
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
/* Token length limit used when no "maxlen" setting is given. */
#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30

/* Apostrophe look-alikes outside ASCII: U+2019 RIGHT SINGLE QUOTATION MARK
   and U+FF07 FULLWIDTH APOSTROPHE. The argument is evaluated more than once,
   so it must not have side effects. */
#define IS_NONASCII_APOSTROPHE(c) \
	((c) == 0x2019 || (c) == 0xFF07)

/* Any apostrophe: the ASCII one (U+0027) or one of the look-alikes above.
   The argument is evaluated more than once. */
#define IS_APOSTROPHE(c) \
	((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
/* Word-break lookup for 7-bit characters, indexed by character code:
   non-zero means the character separates words. */
static unsigned char fts_ascii_word_breaks[128] = {
	/* 0-31: control characters */
	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,

	/* 32-47: space !"#$%& break, ' (39) does not, ()*+,-./ break */
	1, 1, 1, 1, 1, 1, 1, 0,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 48-63: digits do not break, :;<=>? break */
	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 1, 1,
	/* 64-79: @ breaks, A-O do not */
	1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
	/* 80-95: P-Z do not break, [\]^ break, _ does not */
	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 1, 0,
	/* 96-111: ` breaks, a-o do not */
	1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
	/* 112-127: p-z do not break, {|}~ break, 127 does not */
	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 1, 0
};
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozekstatic int
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekfts_tokenizer_generic_create(const char *const *settings,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek struct fts_tokenizer **tokenizer_r,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const char **error_r)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek{
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek struct generic_fts_tokenizer *tok;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek unsigned int i;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek for (i = 0; settings[i] != NULL; i += 2) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const char *key = settings[i], *value = settings[i+1];
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (strcmp(key, "maxlen") == 0) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (str_to_uint(value, &max_length) < 0 ||
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek max_length == 0) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek *error_r = t_strdup_printf(
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek "Invalid maxlen setting: %s", value);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return -1;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek }
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek } else if (strcmp(key, "algorithm") == 0) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (strcmp(value, ALGORITHM_TR29_NAME) == 0)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek algo = BOUNDARY_ALGORITHM_TR29;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek else if (strcmp(value, ALGORITHM_SIMPLE_NAME) == 0)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek ;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek else {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek *error_r = t_strdup_printf(
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek "Invalid algorithm: %s", value);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return -1;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek }
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek } else if (strcmp(key, "search") == 0) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* tokenizing a search string -
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek makes no difference to us */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek } else {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek *error_r = t_strdup_printf("Unknown setting: %s", key);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return -1;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek }
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek }
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok = i_new(struct generic_fts_tokenizer, 1);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (algo == BOUNDARY_ALGORITHM_TR29)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek else
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->max_length = max_length;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->algorithm = algo;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->token = buffer_create_dynamic(default_pool, 64);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek *tokenizer_r = &tok->tokenizer;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return 0;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek}
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic void
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekfts_tokenizer_generic_destroy(struct fts_tokenizer *_tok)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek{
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek struct generic_fts_tokenizer *tok =
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek (struct generic_fts_tokenizer *)_tok;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek buffer_free(&tok->token);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek i_free(tok);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek}
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
/* Duplicate the leading run of complete UTF-8 characters found in the first
   size bytes of data, as a string made with t_strndup(). A partial character
   left at the end (tokens are truncated byte-wise) is dropped.

   Returns "" when not even one complete character is available. That happens
   when the token length limit is smaller than the encoded size of the
   token's first character. The caller treats an empty string as "no token",
   whereas asserting here would abort the process on valid input. */
static const char *fts_uni_strndup(const unsigned char *data, size_t size)
{
	size_t pos;

	/* pos receives the byte length of the usable prefix
	   (NOTE(review): as implemented by uni_utf8_partial_strlen_n() in
	   unichar.h - confirm) */
	(void)uni_utf8_partial_strlen_n(data, size, &pos);
	if (pos == 0)
		return "";
	return t_strndup(data, pos);
}
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic bool
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekfts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const char **token_r)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek{
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const unsigned char *data;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek size_t len = tok->token->used;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (len > 0) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* Remove the trailing apostrophe - it was made
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek into U+0027 earlier. There can be only a single such
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek apostrophe, because otherwise the token would have already
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek been split. We also want to remove the trailing apostrophe
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek only if it's the the last character in the nontruncated
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek token - a truncated token may end with apostrophe. */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek data = tok->token->data;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (data[len-1] == '\'') {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek len--;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek i_assert(len > 0 && data[len-1] != '\'');
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek }
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek }
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek *token_r = len == 0 ? "" :
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek fts_uni_strndup(tok->token->data, len);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek buffer_set_used_size(tok->token, 0);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->prev_letter = LETTER_TYPE_NONE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return (*token_r)[0] != '\0';
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek}
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
/* Look up value among the count entries of data using the
   BINARY_NUMBER_SEARCH() macro, which supplies the whole function body
   including the return statements and the write to *idx_r.
   NOTE(review): binary search implies data is sorted ascending - confirm
   for the generated tables. */
static bool
uint32_find(const uint32_t *data, unsigned int count,
	    uint32_t value, unsigned int *idx_r)
{
	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic bool fts_uni_word_break(unichar_t c)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek{
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek unsigned int idx;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* Unicode General Punctuation, including deprecated characters. */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (c >= 0x2000 && c <= 0x206f)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return TRUE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* From word-break-data.c, which is generated from PropList.txt. */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return TRUE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return TRUE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(Quotation_Mark, N_ELEMENTS(Quotation_Mark), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return TRUE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return TRUE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return TRUE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(Pattern_White_Space, N_ELEMENTS(Pattern_White_Space), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return TRUE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return FALSE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek}
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic inline bool
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekfts_simple_is_word_break(struct generic_fts_tokenizer *tok,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek unichar_t c, bool apostrophe)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek{
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (apostrophe)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek else if (c < 0x80)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return fts_ascii_word_breaks[c] != 0;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek else
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return fts_uni_word_break(c);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek}
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek{
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek struct generic_fts_tokenizer *tok =
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek (struct generic_fts_tokenizer *)_tok;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->prev_letter = LETTER_TYPE_NONE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->prev_prev_letter = LETTER_TYPE_NONE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek buffer_set_used_size(tok->token, 0);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek}
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic void tok_append_truncated(struct generic_fts_tokenizer *tok,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const unsigned char *data, size_t size)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek{
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek buffer_append(tok->token, data,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek I_MIN(size, tok->max_length - tok->token->used));
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek}
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic int
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekfts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const unsigned char *data, size_t size,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek size_t *skip_r, const char **token_r,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const char **error_r ATTR_UNUSED)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek{
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek struct generic_fts_tokenizer *tok =
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek (struct generic_fts_tokenizer *)_tok;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek size_t i, start = 0;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek unsigned int char_size;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek unichar_t c;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek bool apostrophe;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek for (i = 0; i < size; i += char_size) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek i_unreached();
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek char_size = uni_utf8_char_bytes(data[i]);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek apostrophe = IS_APOSTROPHE(c);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (fts_simple_is_word_break(tok, c, apostrophe)) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok_append_truncated(tok, data + start, i - start);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek *skip_r = i + char_size;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return 1;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek }
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek start = i + char_size;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* it doesn't actually matter at this point how whether
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek subsequent apostrophes are handled by prefix
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek skipping or by ignoring empty tokens - they will be
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek dropped in any case. */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->prev_letter = LETTER_TYPE_NONE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek } else if (apostrophe) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* all apostrophes require special handling */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const unsigned char apostrophe_char = '\'';
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok_append_truncated(tok, data + start, i - start);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (tok->token->used > 0)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok_append_truncated(tok, &apostrophe_char, 1);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek start = i + char_size;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek } else {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->prev_letter = LETTER_TYPE_NONE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek }
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek }
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* word boundary not found yet */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok_append_truncated(tok, data + start, i - start);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek *skip_r = i;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* return the last token */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (size == 0) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (fts_tokenizer_generic_simple_current_token(tok, token_r))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return 1;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher }
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return 0;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher}
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher/* TODO: Arrange array searches roughly in order of likelyhood of a match.
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher TODO: Make some array of the arrays, so this can be a foreach loop.
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher TODO: Check for Hangul.
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher U+058A ( ÖŠ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher HYPHEN.
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher TODO
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher*/
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic enum letter_type letter_type(unichar_t c)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher{
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher unsigned int idx;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (IS_APOSTROPHE(c))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_APOSTROPHE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(CR, N_ELEMENTS(CR), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_CR;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(LF, N_ELEMENTS(LF), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_LF;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Newline, N_ELEMENTS(Newline), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_NEWLINE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Extend, N_ELEMENTS(Extend), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_EXTEND;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Regional_Indicator, N_ELEMENTS(Regional_Indicator), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_REGIONAL_INDICATOR;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Format, N_ELEMENTS(Format), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_FORMAT;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Katakana, N_ELEMENTS(Katakana), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_KATAKANA;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Hebrew_Letter, N_ELEMENTS(Hebrew_Letter), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_HEBREW_LETTER;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(ALetter, N_ELEMENTS(ALetter), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_ALETTER;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Single_Quote, N_ELEMENTS(Single_Quote), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_SINGLE_QUOTE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Double_Quote, N_ELEMENTS(Double_Quote), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_DOUBLE_QUOTE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(MidNumLet, N_ELEMENTS(MidNumLet), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_MIDNUMLET;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(MidLetter, N_ELEMENTS(MidLetter), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_MIDLETTER;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(MidNum, N_ELEMENTS(MidNum), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_MIDNUM;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Numeric, N_ELEMENTS(Numeric), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_NUMERIC;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(ExtendNumLet, N_ELEMENTS(ExtendNumLet), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_EXTENDNUMLET;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return LETTER_TYPE_OTHER;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher}
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic bool letter_panic(struct generic_fts_tokenizer *tok ATTR_UNUSED)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher{
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher i_panic("Letter type should not be used.");
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher}
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher/* WB3, WB3a and WB3b, but really different since we try to eat
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher whitespace between words. */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic bool letter_cr_lf_newline(struct generic_fts_tokenizer *tok ATTR_UNUSED)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher{
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return TRUE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher}
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic bool letter_extend_format(struct generic_fts_tokenizer *tok ATTR_UNUSED)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher{
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* WB4 */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return FALSE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher}
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
e4c29d1f8e3b2c2b268105f169e5156a0a36aebfOndrej Kosstatic bool letter_regional_indicator(struct generic_fts_tokenizer *tok)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher{
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* WB13c */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_REGIONAL_INDICATOR)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return FALSE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return TRUE; /* Any / Any */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher}
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic bool letter_katakana(struct generic_fts_tokenizer *tok)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher{
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher /* WB13 */
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_KATAKANA)
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher return FALSE;
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher /* WB13b */
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher return FALSE;
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher return TRUE; /* Any / Any */
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher}
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagherstatic bool letter_hebrew(struct generic_fts_tokenizer *tok)
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher{
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher /* WB5 */
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher return FALSE;
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher /* WB7 WB7c, except MidNumLet */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (tok->prev_prev_letter == LETTER_TYPE_HEBREW_LETTER &&
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_MIDLETTER ||
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return FALSE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* WB10 */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_NUMERIC)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return FALSE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* WB13b */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny return FALSE;
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return TRUE; /* Any / Any */
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozek}
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozekstatic bool letter_aletter(struct generic_fts_tokenizer *tok)
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozek{
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek /* WB5 */
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozek if (tok->prev_letter == LETTER_TYPE_ALETTER)
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny return FALSE;
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny
4a1e58d85409fbb7a12ac244c3dbef8c0c1b15dfMichal Zidek /* WB7, except MidNumLet */
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny if (tok->prev_prev_letter == LETTER_TYPE_ALETTER &&
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny tok->prev_letter == LETTER_TYPE_MIDLETTER))
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny return FALSE;
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny /* WB10 */
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny if (tok->prev_letter == LETTER_TYPE_NUMERIC)
04759b59e71c78ab23b84d13dd29d9c6dd680adbMichal Zidek return FALSE;
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny /* WB13b */
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny return FALSE;
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny return TRUE; /* Any / Any */
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny}
4a1e58d85409fbb7a12ac244c3dbef8c0c1b15dfMichal Zidek
4a1e58d85409fbb7a12ac244c3dbef8c0c1b15dfMichal Zidekstatic bool letter_single_quote(struct generic_fts_tokenizer *tok)
4a1e58d85409fbb7a12ac244c3dbef8c0c1b15dfMichal Zidek{
4a1e58d85409fbb7a12ac244c3dbef8c0c1b15dfMichal Zidek /* WB6 */
4a1e58d85409fbb7a12ac244c3dbef8c0c1b15dfMichal Zidek if (tok->prev_letter == LETTER_TYPE_ALETTER ||
4a1e58d85409fbb7a12ac244c3dbef8c0c1b15dfMichal Zidek tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
4a1e58d85409fbb7a12ac244c3dbef8c0c1b15dfMichal Zidek return FALSE;
4a1e58d85409fbb7a12ac244c3dbef8c0c1b15dfMichal Zidek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek /* WB12 */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (tok->prev_letter == LETTER_TYPE_NUMERIC)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return FALSE;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return TRUE; /* Any / Any */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek}
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozekstatic bool letter_double_quote(struct generic_fts_tokenizer *tok)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek{
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return FALSE;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return TRUE; /* Any / Any */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek}
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
/* Word-break rule for a character of MidNumLet type.

   The tokenizer state is not consulted: a break is always reported,
   whatever preceded the character. */
static bool letter_midnumlet(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{

	/* Break at MidNumLet, non-conformant with WB6/WB7 */
	return TRUE;
}
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozekstatic bool letter_midletter(struct generic_fts_tokenizer *tok)
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek{
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny /* WB6 */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (tok->prev_letter == LETTER_TYPE_ALETTER ||
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny return FALSE;
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny return TRUE; /* Any / Any */
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny}
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zelenystatic bool letter_midnum(struct generic_fts_tokenizer *tok)
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny{
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny /* WB12 */
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny if (tok->prev_letter == LETTER_TYPE_NUMERIC)
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny return FALSE;
b096321a5a02dda0b6b71ba0f9c4d8feacd979e4Michal Zidek
b096321a5a02dda0b6b71ba0f9c4d8feacd979e4Michal Zidek return TRUE; /* Any / Any */
b096321a5a02dda0b6b71ba0f9c4d8feacd979e4Michal Zidek}
b096321a5a02dda0b6b71ba0f9c4d8feacd979e4Michal Zidek
b096321a5a02dda0b6b71ba0f9c4d8feacd979e4Michal Zidekstatic bool letter_numeric(struct generic_fts_tokenizer *tok)
b096321a5a02dda0b6b71ba0f9c4d8feacd979e4Michal Zidek{
b096321a5a02dda0b6b71ba0f9c4d8feacd979e4Michal Zidek /* WB8 */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (tok->prev_letter == LETTER_TYPE_NUMERIC)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return FALSE;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek /* WB9 */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (tok->prev_letter == LETTER_TYPE_ALETTER ||
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return FALSE;
9a9a813906472ffff3911b6006d023e1c6cbff8aSumit Bose
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek /* WB11 */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if(tok->prev_prev_letter == LETTER_TYPE_NUMERIC &&
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek (tok->prev_letter == LETTER_TYPE_MIDNUM ||
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE))
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return FALSE;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek /* WB13b */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return FALSE;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return TRUE; /* Any / Any */
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek}
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zelenystatic bool letter_extendnumlet(struct generic_fts_tokenizer *tok)
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny{
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny /* WB13a */
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny if (tok->prev_letter == LETTER_TYPE_ALETTER ||
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny tok->prev_letter == LETTER_TYPE_HEBREW_LETTER ||
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny tok->prev_letter == LETTER_TYPE_NUMERIC ||
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny tok->prev_letter == LETTER_TYPE_KATAKANA ||
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny return FALSE;
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozek return TRUE; /* Any / Any */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek}
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozekstatic bool letter_apostrophe(struct generic_fts_tokenizer *tok)
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek{
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozek
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek if (tok->prev_letter == LETTER_TYPE_ALETTER ||
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozek return FALSE;
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozek
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozek return TRUE; /* Any / Any */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek}
/* Word-break rule for a character of the "other" type.

   The tokenizer state is not consulted: a break is always reported. */
static bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	return TRUE; /* Any / Any */
}
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozekstatic void
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozekadd_prev_letter(struct generic_fts_tokenizer *tok, enum letter_type lt)
9ab243b369ba317cc964080786dbcdebaf23d6beMichal Zidek{
9ab243b369ba317cc964080786dbcdebaf23d6beMichal Zidek if(tok->prev_letter != LETTER_TYPE_NONE) {
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek tok->prev_prev_letter = tok->prev_letter;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek tok->prev_letter = lt;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek } else
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek tok->prev_letter = lt;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek}
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek/*
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek TODO: Define what to skip between words.
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek TODO: Include double quotation marks? Messes up parsing?
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek TODO: Does this "reverse approach" include too much in "whitespace"?
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek TODO: Possibly use is_word_break()?
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozekstatic bool is_nontoken(enum letter_type lt)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek{
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA ||
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER ||
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek lt == LETTER_TYPE_NUMERIC)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return FALSE;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek return TRUE;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek}
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
/* The way things are done, WB6/7 and WB11/12 "false positives" can
   leave unwanted trailing characters. They are searched for here. This
   is very kludgy and should be coded into the rules themselves
   somehow.
*/
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozekstatic bool is_one_past_end(struct generic_fts_tokenizer *tok)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek{
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek /* WB6/7 false positive detected at one past end. */
9ab243b369ba317cc964080786dbcdebaf23d6beMichal Zidek if (tok->prev_letter == LETTER_TYPE_MIDLETTER ||
9ab243b369ba317cc964080786dbcdebaf23d6beMichal Zidek tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE )
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny return TRUE;
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek /* WB11/12 false positive detected at one past end. */
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek if (tok->prev_letter == LETTER_TYPE_MIDNUM ||
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return TRUE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return FALSE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher}
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic void
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherfts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher const char **token_r)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher{
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher const unsigned char *data = tok->token->data;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher ssize_t len = tok->token->used;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (is_one_past_end(tok)) {
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* delete the last character */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher while ((data[len-1] & 0x80) != 0)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher len--;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek i_assert(len > 0);
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek len--;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek }
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek /* we're skipping all non-token chars at the beginning of the word,
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek so by this point we must have something here - even if we just
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek deleted the last character */
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek i_assert(len > 0);
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek tok->prev_prev_letter = LETTER_TYPE_NONE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter = LETTER_TYPE_NONE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher *token_r = fts_uni_strndup(data, len);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher buffer_set_used_size(tok->token, 0);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher}
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstruct letter_fn {
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher bool (*fn)(struct generic_fts_tokenizer *tok);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher};
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic struct letter_fn letter_fns[] = {
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek {letter_panic}, {letter_cr_lf_newline}, {letter_cr_lf_newline},
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher {letter_cr_lf_newline}, {letter_extend_format},
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher {letter_regional_indicator}, {letter_extend_format},
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher {letter_katakana}, {letter_hebrew}, {letter_aletter},
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher {letter_single_quote}, {letter_double_quote},
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher {letter_midnumlet}, {letter_midletter}, {letter_midnum},
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek {letter_numeric}, {letter_extendnumlet}, {letter_panic},
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek {letter_panic}, {letter_apostrophe}, {letter_other}
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek};
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek
/*
   Find word boundaries in input text. Based on Unicode Standard Annex
   #29, but tailored for FTS purposes.
   http://www.unicode.org/reports/tr29/

   Adaptations:
   * No word boundary at Start-Of-Text or End-Of-Text (WB1 and WB2).
   * Break just once, not before and after.
   * Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
   * Other things also (e.g. is_nontoken()), so not really pure TR29.
     Meant to assist in finding individual words.
*/
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic bool
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagheruni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher{
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* No rule knows what to do with just one char, except the linebreaks
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher we eat away (above) anyway. */
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek if (tok->prev_letter != LETTER_TYPE_NONE) {
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (letter_fns[lt].fn(tok))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return TRUE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher }
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (lt == LETTER_TYPE_EXTEND || lt == LETTER_TYPE_FORMAT) {
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny /* These types are completely ignored. */
b1caacb098ae99ad65144120fdec4d0fd98ad9d5Pavel Březina } else {
b1caacb098ae99ad65144120fdec4d0fd98ad9d5Pavel Březina add_prev_letter(tok,lt);
b1caacb098ae99ad65144120fdec4d0fd98ad9d5Pavel Březina }
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return FALSE;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher}
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozekstatic int
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozekfts_tokenizer_generic_next_tr29(struct fts_tokenizer *_tok,
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher const unsigned char *data, size_t size,
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher size_t *skip_r, const char **token_r,
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher const char **error_r ATTR_UNUSED)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher{
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny struct generic_fts_tokenizer *tok =
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek (struct generic_fts_tokenizer *)_tok;
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek unichar_t c;
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek size_t i, char_start_i, start_pos = 0;
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny enum letter_type lt;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* TODO: Process 8bit chars separately, to speed things up. */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher for (i = 0; i < size; ) {
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher char_start_i = i;
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek i_unreached();
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek i += uni_utf8_char_bytes(data[i]);
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek lt = letter_type(c);
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (tok->prev_letter == LETTER_TYPE_NONE && is_nontoken(lt)) {
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek /* Skip non-token chars at the beginning of token */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher i_assert(tok->token->used == 0);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher start_pos = i;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher continue;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher }
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uni_found_word_boundary(tok, lt)) {
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher i_assert(char_start_i >= start_pos && size >= start_pos);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok_append_truncated(tok, data + start_pos,
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek char_start_i - start_pos);
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek *skip_r = i;
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek fts_tokenizer_generic_tr29_current_token(tok, token_r);
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek return 1;
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek } else if (lt == LETTER_TYPE_APOSTROPHE ||
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek lt == LETTER_TYPE_SINGLE_QUOTE) {
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek /* all apostrophes require special handling */
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek const unsigned char apostrophe_char = '\'';
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok_append_truncated(tok, data + start_pos,
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher char_start_i - start_pos);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok_append_truncated(tok, &apostrophe_char, 1);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher start_pos = i;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher }
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher }
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher i_assert(i >= start_pos && size >= start_pos);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok_append_truncated(tok, data + start_pos, i - start_pos);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher *skip_r = i;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (size == 0 && tok->token->used > 0) {
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* return the last token */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher *skip_r = 0;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher fts_tokenizer_generic_tr29_current_token(tok, token_r);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return 1;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher }
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher return 0;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher}
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
/* Placeholder "next" function for generic_tokenizer_vfuncs.

   It ignores all of its arguments and calls i_unreached(), i.e. it is
   not meant to be called.
   NOTE(review): presumably one of the algorithm-specific vfuncs tables
   is installed before tokenizing starts - confirm in the create
   function. */
static int
fts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED,
			   const unsigned char *data ATTR_UNUSED,
			   size_t size ATTR_UNUSED,
			   size_t *skip_r ATTR_UNUSED,
			   const char **token_r ATTR_UNUSED,
			   const char **error_r ATTR_UNUSED)
{
	i_unreached();
}
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
/* Function table referenced by fts_tokenizer_generic_real. Its last
   entry is fts_tokenizer_generic_next, which only calls i_unreached().
   The initializer is positional, so the order must follow the member
   order of struct fts_tokenizer_vfuncs. */
static const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
	fts_tokenizer_generic_create,
	fts_tokenizer_generic_destroy,
	fts_tokenizer_generic_reset,
	fts_tokenizer_generic_next
};
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
/* Descriptor of the tokenizer named "generic", bound to
   generic_tokenizer_vfuncs. */
static const struct fts_tokenizer fts_tokenizer_generic_real = {
	.name = "generic",
	.v = &generic_tokenizer_vfuncs
};
/* Exported handle pointing at the descriptor above. */
const struct fts_tokenizer *fts_tokenizer_generic = &fts_tokenizer_generic_real;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher
/* Function table whose "next" entry is
   fts_tokenizer_generic_next_simple; the other entries are the same
   create/destroy/reset functions used by the other tables in this
   file. Positional initializer: the order must follow the member order
   of struct fts_tokenizer_vfuncs. */
const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {
	fts_tokenizer_generic_create,
	fts_tokenizer_generic_destroy,
	fts_tokenizer_generic_reset,
	fts_tokenizer_generic_next_simple
};
/* Function table whose "next" entry is
   fts_tokenizer_generic_next_tr29; the other entries are the same
   create/destroy/reset functions used by the other tables in this
   file. Positional initializer: the order must follow the member order
   of struct fts_tokenizer_vfuncs. */
const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29 = {
	fts_tokenizer_generic_create,
	fts_tokenizer_generic_destroy,
	fts_tokenizer_generic_reset,
	fts_tokenizer_generic_next_tr29
};
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher