fts-tokenizer-generic.c revision a2b6f7f91c904706acbaa0e900df01de6d046861
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher ((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic unsigned char fts_ascii_word_breaks[128] = {
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 32-47: !"#$%&()*+,-./ */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, /* 48-63: :;<=>? */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64-79: @ */
74e95cfd9d3939dfe9417d79d2f6fc79b361405fJakub Hrozek 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 80-95: [\]^ */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96-111: ` */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0 /* 112-127: {|}~ */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekfts_tokenizer_generic_create(const char *const *settings,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const char **error_r)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek unsigned int i;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const char *key = settings[i], *value = settings[i+1];
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek else if (strcmp(value, ALGORITHM_SIMPLE_NAME) == 0)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* tokenizing a search string -
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek makes no difference to us */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek *error_r = t_strdup_printf("Unknown setting: %s", key);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok->token = buffer_create_dynamic(default_pool, 64);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekfts_tokenizer_generic_destroy(struct fts_tokenizer *_tok)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic const char *fts_uni_strndup(const unsigned char *data, size_t size)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* if input is truncated with a partial UTF-8 character, drop it */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek (void)uni_utf8_partial_strlen_n(data, size, &pos);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekfts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const char **token_r)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek const unsigned char *data;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* Remove the trailing apostrophe - it was made
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek into U+0027 earlier. There can be only a single such
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek apostrophe, because otherwise the token would have already
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek been split. We also want to remove the trailing apostrophe
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek only if it's the last character in the nontruncated
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek token - a truncated token may end with apostrophe. */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic bool uint32_find(const uint32_t *data, unsigned int count,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek BINARY_NUMBER_SEARCH(data, count, value, idx_r);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek unsigned int idx;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* Unicode General Punctuation, including deprecated characters. */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* From word-break-data.c, which is generated from PropList.txt. */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(Quotation_Mark, N_ELEMENTS(Quotation_Mark), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uint32_find(Pattern_White_Space, N_ELEMENTS(Pattern_White_Space), c, &idx))
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic inline bool
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekfts_simple_is_word_break(struct generic_fts_tokenizer *tok,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek else if (c < 0x80)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekstatic void tok_append_truncated(struct generic_fts_tokenizer *tok,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek I_MIN(size, tok->max_length - tok->token->used));
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozekfts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (fts_simple_is_word_break(tok, c, apostrophe)) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok_append_truncated(tok, data + start, i - start);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* it doesn't actually matter at this point whether
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek subsequent apostrophes are handled by prefix
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek skipping or by ignoring empty tokens - they will be
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek dropped in any case. */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* all apostrophes require special handling */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok_append_truncated(tok, data + start, i - start);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok_append_truncated(tok, &apostrophe_char, 1);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* word boundary not found yet */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek tok_append_truncated(tok, data + start, i - start);
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek /* return the last token */
ebc6ab564dc2a0a2b08c42d727fc403dde4a2dc9Jakub Hrozek if (fts_tokenizer_generic_simple_current_token(tok, token_r))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher/* TODO: Arrange array searches roughly in order of likelihood of a match.
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher TODO: Make some array of the arrays, so this can be a foreach loop.
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher TODO: Check for Hangul.
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher U+058A ( ֊ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic enum letter_type letter_type(unichar_t c)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher unsigned int idx;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(CR, N_ELEMENTS(CR), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(LF, N_ELEMENTS(LF), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Newline, N_ELEMENTS(Newline), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Extend, N_ELEMENTS(Extend), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Regional_Indicator, N_ELEMENTS(Regional_Indicator), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Format, N_ELEMENTS(Format), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Katakana, N_ELEMENTS(Katakana), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Hebrew_Letter, N_ELEMENTS(Hebrew_Letter), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(ALetter, N_ELEMENTS(ALetter), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Single_Quote, N_ELEMENTS(Single_Quote), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Double_Quote, N_ELEMENTS(Double_Quote), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(MidNumLet, N_ELEMENTS(MidNumLet), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(MidLetter, N_ELEMENTS(MidLetter), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(MidNum, N_ELEMENTS(MidNum), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(Numeric, N_ELEMENTS(Numeric), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (uint32_find(ExtendNumLet, N_ELEMENTS(ExtendNumLet), c, &idx))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic bool letter_panic(struct generic_fts_tokenizer *tok ATTR_UNUSED)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher i_panic("Letter type should not be used.");
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher/* WB3, WB3a and WB3b, but really different since we try to eat
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher whitespace between words. */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic bool letter_cr_lf_newline(struct generic_fts_tokenizer *tok ATTR_UNUSED)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic bool letter_extend_format(struct generic_fts_tokenizer *tok ATTR_UNUSED)
e4c29d1f8e3b2c2b268105f169e5156a0a36aebfOndrej Kosstatic bool letter_regional_indicator(struct generic_fts_tokenizer *tok)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_REGIONAL_INDICATOR)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic bool letter_katakana(struct generic_fts_tokenizer *tok)
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_KATAKANA)
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagherstatic bool letter_hebrew(struct generic_fts_tokenizer *tok)
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
346f41f1ede975cb2db0af570f5b454b9b306704Stephen Gallagher /* WB7 WB7c, except MidNumLet */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (tok->prev_prev_letter == LETTER_TYPE_HEBREW_LETTER &&
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_MIDLETTER ||
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE))
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_NUMERIC)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozekstatic bool letter_aletter(struct generic_fts_tokenizer *tok)
4a1e58d85409fbb7a12ac244c3dbef8c0c1b15dfMichal Zidek /* WB7, except MidNumLet */
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny if (tok->prev_prev_letter == LETTER_TYPE_ALETTER &&
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
4a1e58d85409fbb7a12ac244c3dbef8c0c1b15dfMichal Zidekstatic bool letter_single_quote(struct generic_fts_tokenizer *tok)
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozekstatic bool letter_double_quote(struct generic_fts_tokenizer *tok)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozekstatic bool letter_midnumlet(struct generic_fts_tokenizer *tok ATTR_UNUSED)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek /* Break at MidNumLet, non-conformant with WB6/WB7 */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozekstatic bool letter_midletter(struct generic_fts_tokenizer *tok)
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zelenystatic bool letter_midnum(struct generic_fts_tokenizer *tok)
b096321a5a02dda0b6b71ba0f9c4d8feacd979e4Michal Zidekstatic bool letter_numeric(struct generic_fts_tokenizer *tok)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if(tok->prev_prev_letter == LETTER_TYPE_NUMERIC &&
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zelenystatic bool letter_extendnumlet(struct generic_fts_tokenizer *tok)
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozekstatic bool letter_apostrophe(struct generic_fts_tokenizer *tok)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozekstatic bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
7119f0c483049a8850d3075c0b1062f35200a538Jakub Hrozekadd_prev_letter(struct generic_fts_tokenizer *tok, enum letter_type lt)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek TODO: Define what to skip between words.
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek TODO: Include double quotation marks? Messes up parsing?
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek TODO: Does this "reverse approach" include too much in "whitespace"?
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek TODO: Possibly use is_word_break()?
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA ||
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER ||
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek/* The way things are done WB6/7 and WB11/12 "false positives" can
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek leave trailing unwanted chars. They are searched for here. This is
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek very kludgy and should be coded into the rules themselves
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozekstatic bool is_one_past_end(struct generic_fts_tokenizer *tok)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek /* WB6/7 false positive detected at one past end. */
9ab243b369ba317cc964080786dbcdebaf23d6beMichal Zidek if (tok->prev_letter == LETTER_TYPE_MIDLETTER ||
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE )
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek /* WB11/12 false positive detected at one past end. */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherfts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher const unsigned char *data = tok->token->data;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* delete the last character */
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek /* we're skipping all non-token chars at the beginning of the word,
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek so by this point we must have something here - even if we just
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek deleted the last character */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher bool (*fn)(struct generic_fts_tokenizer *tok);
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek {letter_panic}, {letter_cr_lf_newline}, {letter_cr_lf_newline},
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher {letter_cr_lf_newline}, {letter_extend_format},
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher {letter_regional_indicator}, {letter_extend_format},
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher {letter_katakana}, {letter_hebrew}, {letter_aletter},
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher {letter_single_quote}, {letter_double_quote},
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher {letter_midnumlet}, {letter_midletter}, {letter_midnum},
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek {letter_numeric}, {letter_extendnumlet}, {letter_panic},
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek {letter_panic}, {letter_apostrophe}, {letter_other}
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek Find word boundaries in input text. Based on Unicode standard annex
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek #29, but tailored for FTS purposes.
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher * No word boundary at Start-Of-Text or End-of-Text (WB1 and WB2).
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher * Break just once, not before and after.
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher * Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek * Other things also (e.g. is_nontoken()), not really pure tr29. Meant
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher to assist in finding individual words.
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagheruni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt)
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* No rule knows what to do with just one char, except the linebreaks
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher we eat away (above) anyway. */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher if (lt == LETTER_TYPE_EXTEND || lt == LETTER_TYPE_FORMAT) {
016e0d7202ff965018e41869c5ab501f86b0d081Jan Zeleny /* These types are completely ignored. */
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozekfts_tokenizer_generic_next_tr29(struct fts_tokenizer *_tok,
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* TODO: Process 8bit chars separately, to speed things up. */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher for (i = 0; i < size; ) {
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozek if (tok->prev_letter == LETTER_TYPE_NONE && is_nontoken(lt)) {
59415636c92c6e9764ddc65a85ad61002310519dJakub Hrozek /* Skip non-token chars at the beginning of token */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher i_assert(char_start_i >= start_pos && size >= start_pos);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok_append_truncated(tok, data + start_pos,
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek fts_tokenizer_generic_tr29_current_token(tok, token_r);
ba95f1c434b430f0db7fddbd865af10488ecab17Jakub Hrozek /* all apostrophes require special handling */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok_append_truncated(tok, data + start_pos,
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok_append_truncated(tok, &apostrophe_char, 1);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher i_assert(i >= start_pos && size >= start_pos);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher tok_append_truncated(tok, data + start_pos, i - start_pos);
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher /* return the last token */
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagher fts_tokenizer_generic_tr29_current_token(tok, token_r);
3a3fd60043234038c6ff6584a5b92fb757c4afe1Lukas Slebodnikfts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED,
55d80b1301fe969fb4ba2b9481027887b9462dbbJakub Hrozekstatic const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherstatic const struct fts_tokenizer fts_tokenizer_generic_real = {
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherconst struct fts_tokenizer *fts_tokenizer_generic = &fts_tokenizer_generic_real;
effcbdb12c7ef892f1fd92a745cb33a08ca4ba30Stephen Gallagherconst struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {