bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila#define FTS_WB5A_PREFIX_MAX_LENGTH 3 /* Including apostrophe */
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovilastatic unsigned char fts_ascii_word_breaks[128] = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 32-47: !"#$%&()*+,-./ */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, /* 48-63: :;<=>? */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64-79: @ */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 80-95: [\]^ */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96-111: ` */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0 /* 112-127: {|}~ */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_create(const char *const *settings,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char **error_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
50f659bc47b06939dcb7694c928c9f8aa7f56229Teemu Huovila enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *key = settings[i], *value = settings[i+1];
505bba1d21a851fb9e719bbf381f8be592afa50aTimo Sirainen else if (strcmp(value, ALGORITHM_SIMPLE_NAME) == 0)
0c827d2094e80ede4c089fc00260d7ffcc764636Timo Sirainen /* tokenizing a search string -
0c827d2094e80ede4c089fc00260d7ffcc764636Timo Sirainen makes no difference to us */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *error_r = t_strdup_printf("Unknown setting: %s", key);
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if (wb5a && algo != BOUNDARY_ALGORITHM_TR29) {
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila *error_r = "Can not use WB5a for algorithms other than TR29.";
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->token = buffer_create_dynamic(default_pool, 64);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_destroy(struct fts_tokenizer *_tok)
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilafts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const char **token_r)
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen if (tok->untruncated_length <= tok->max_length) {
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen /* Remove the trailing apostrophe - it was made
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen into U+0027 earlier. There can be only a single such
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen apostrophe, because otherwise the token would have already
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen been split. We also want to remove the trailing apostrophe
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen only if it's the last character in the nontruncated
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen token - a truncated token may end with apostrophe. */
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen fts_tokenizer_delete_trailing_partial_char(data, &len);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool uint32_find(const uint32_t *data, unsigned int count,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen BINARY_NUMBER_SEARCH(data, count, value, idx_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* Unicode General Punctuation, including deprecated characters. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* From word-break-data.c, which is generated from PropList.txt. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila if (uint32_find(Quotation_Mark, N_ELEMENTS(Quotation_Mark), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Pattern_White_Space, N_ELEMENTS(Pattern_White_Space), c, &idx))
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainenstatic inline bool
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainenfts_simple_is_word_break(struct generic_fts_tokenizer *tok,
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
dfc9cfd5b80c8a4240841e12425eb23636ce674eTimo Sirainen else if (c < 0x80)
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainenstatic void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
868aa947baeabd4328ca70525b9dd678ea389bf2Timo Sirainenstatic void tok_append_truncated(struct generic_fts_tokenizer *tok,
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen I_MIN(size, tok->max_length - tok->token->used));
9c34a19b60871463270e61a38aae8050a7633513Teemu Huovilafts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,
cf755ce29d2c2499b0da66868c3b15840078d0baTimo Sirainen char_size = uni_utf8_get_char_n(data + i, size - i, &c);
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen if (fts_simple_is_word_break(tok, c, apostrophe)) {
8acf0a8559c8e969a9079e65bf021a89cebaf10aTimo Sirainen tok_append_truncated(tok, data + start, i - start);
3fe4e251c34ba63c4b50df72813e2781dccb562eTimo Sirainen if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen /* it doesn't actually matter at this point whether
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen subsequent apostrophes are handled by prefix
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen skipping or by ignoring empty tokens - they will be
e3f8b4fd89a11442c3208cd3210cfaccd4835386Timo Sirainen dropped in any case. */
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen /* all apostrophes require special handling */
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen tok_append_truncated(tok, data + start, i - start);
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen tok_append_truncated(tok, &apostrophe_char, 1);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* word boundary not found yet */
8acf0a8559c8e969a9079e65bf021a89cebaf10aTimo Sirainen tok_append_truncated(tok, data + start, i - start);
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila /* return the last token */
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila if (fts_tokenizer_generic_simple_current_token(tok, token_r))
211c638d81d382517d196ad47565e0d85012c927klemens/* TODO: Arrange array searches roughly in order of likelihood of a match.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Make some array of the arrays, so this can be a foreach loop.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Check for Hangul.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen U+058A ( ֊ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic enum letter_type letter_type(unichar_t c)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Newline, N_ELEMENTS(Newline), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Extend, N_ELEMENTS(Extend), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Regional_Indicator, N_ELEMENTS(Regional_Indicator), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Format, N_ELEMENTS(Format), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Katakana, N_ELEMENTS(Katakana), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Hebrew_Letter, N_ELEMENTS(Hebrew_Letter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(ALetter, N_ELEMENTS(ALetter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Single_Quote, N_ELEMENTS(Single_Quote), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Double_Quote, N_ELEMENTS(Double_Quote), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidNumLet, N_ELEMENTS(MidNumLet), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidLetter, N_ELEMENTS(MidLetter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidNum, N_ELEMENTS(MidNum), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Numeric, N_ELEMENTS(Numeric), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(ExtendNumLet, N_ELEMENTS(ExtendNumLet), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_panic(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* WB3, WB3a and WB3b, but really different since we try to eat
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen whitespace between words. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_cr_lf_newline(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_extend_format(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_regional_indicator(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_REGIONAL_INDICATOR)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_katakana(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_hebrew(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila /* WB7 WB7c, except MidNumLet */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_prev_letter == LETTER_TYPE_HEBREW_LETTER &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_aletter(struct generic_fts_tokenizer *tok)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if (IS_WB5A_APOSTROPHE(tok->prev_letter_c) && IS_VOWEL(tok->letter_c)) {
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila /* WB7, except MidNumLet */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_prev_letter == LETTER_TYPE_ALETTER &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_single_quote(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_double_quote(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE)
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovilastatic bool letter_midnumlet(struct generic_fts_tokenizer *tok ATTR_UNUSED)
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila /* Break at MidNumLet, non-conformant with WB6/WB7 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midletter(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midnum(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_numeric(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if(tok->prev_prev_letter == LETTER_TYPE_NUMERIC &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_extendnumlet(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovilastatic bool letter_apostrophe(struct generic_fts_tokenizer *tok)
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila if (tok->prev_letter == LETTER_TYPE_ALETTER ||
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovilastatic bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenadd_prev_letter(struct generic_fts_tokenizer *tok, enum letter_type lt)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovilaadd_letter_c(struct generic_fts_tokenizer *tok, unichar_t c)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Define what to skip between words.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Include double quotation marks? Messes up parsing?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Does this "reverse approach" include too much in "whitespace"?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Possibly use is_word_break()?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* The way things are done WB6/7 and WB11/12 "false positives" can
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen leave trailing unwanted chars. They are searched for here. This is
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen very kludgy and should be coded into the rules themselves
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool is_one_past_end(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6/7 false positive detected at one past end. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_MIDLETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE )
73e7fedf77599bb30644bd2e089ce5a8b3a65532Teemu Huovila /* WB11/12 false positive detected at one past end. */
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilafts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const char **token_r)
b15ff9096eab230fa041996d9340b96ac7343c0dTimo Sirainen /* delete the last character */
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen } else if (tok->untruncated_length > tok->max_length) {
1b8da092e7ac34e81b917db7bdaba484acf1921cTimo Sirainen fts_tokenizer_delete_trailing_partial_char(data, &len);
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen /* we're skipping all non-token chars at the beginning of the word,
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen so by this point we must have something here - even if we just
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen deleted the last character */
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovilastatic void wb5a_reinsert(struct generic_fts_tokenizer *tok)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila buffer_insert(tok->token, 0, str_data(utf8_str), str_len(utf8_str));
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila tok->prev_letter = letter_type(tok->letter_c);
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen bool (*fn)(struct generic_fts_tokenizer *tok);
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_panic}, {letter_cr_lf_newline}, {letter_cr_lf_newline},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_cr_lf_newline}, {letter_extend_format},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_regional_indicator}, {letter_extend_format},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_katakana}, {letter_hebrew}, {letter_aletter},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_midnumlet}, {letter_midletter}, {letter_midnum},
34c7e8b10f94e9b76bd5b64b146c0c7e1a65e0f9Timo Sirainen {letter_numeric}, {letter_extendnumlet}, {letter_panic},
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila {letter_panic}, {letter_apostrophe}, {letter_other}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen Find word boundaries in input text. Based on Unicode standard annex
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen #29, but tailored for FTS purposes.
d1623103c73ed7dd8b15b2060ad656fddbed7b46Teemu Huovila Note: The text of tr29 is a living standard, so it keeps
d1623103c73ed7dd8b15b2060ad656fddbed7b46Teemu Huovila changing. In newer specs some characters are combined, like AHLetter
d1623103c73ed7dd8b15b2060ad656fddbed7b46Teemu Huovila (ALetter | Hebrew_Letter) and MidNumLetQ (MidNumLet | Single_Quote).
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila * Added optional WB5a as a configurable option. The cut of prefix is
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila max FTS_WB5A_PREFIX_MAX_LENGTH chars.
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila * No word boundary at Start-Of-Text or End-of-Text (WB1 and WB2).
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila * Break just once, not before and after.
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila * Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen * Other things also (e.g. is_nontoken()), not really pure tr29. Meant
65a2c8fef977bcf4625fdb5e2f524b42667cb501Teemu Huovila to assist in finding individual words.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenuni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* No rule knows what to do with just one char, except the linebreaks
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen we eat away (above) anyway. */
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen if (lt == LETTER_TYPE_EXTEND || lt == LETTER_TYPE_FORMAT) {
05031501650185cde7150fd538d35ce66d87b526Timo Sirainen /* These types are completely ignored. */
9c34a19b60871463270e61a38aae8050a7633513Teemu Huovilafts_tokenizer_generic_tr29_next(struct fts_tokenizer *_tok,
afb62c7a52713b079efa16fef6e4de62cf1bf853Timo Sirainen for (i = 0; i < size; ) {
cf755ce29d2c2499b0da66868c3b15840078d0baTimo Sirainen char_size = uni_utf8_get_char_n(data + i, size - i, &c);
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila /* The WB5a break is detected only when the "after
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila break" char is inspected. That char needs to be
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila reinserted as the "previous char". */
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NONE && is_nontoken(lt)) {
bf8b21a6647479bd9b2ccc8866ad6b077ed9af41Timo Sirainen /* Skip non-token chars at the beginning of token */
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen i_assert(char_start_i >= start_pos && size >= start_pos);
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen fts_tokenizer_generic_tr29_current_token(tok, token_r);
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen /* all apostrophes require special handling */
a2b6f7f91c904706acbaa0e900df01de6d046861Timo Sirainen tok_append_truncated(tok, &apostrophe_char, 1);
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen i_assert(i >= start_pos && size >= start_pos);
67360bc4d7c3fbcedcf7364ea2290406c8e0d082Timo Sirainen tok_append_truncated(tok, data + start_pos, i - start_pos);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* return the last token */
3deb8a4df8b4fec55170d518db6cf4c963baf5c6Timo Sirainen fts_tokenizer_generic_tr29_current_token(tok, token_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_tokenizer fts_tokenizer_generic_real = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer *fts_tokenizer_generic = &fts_tokenizer_generic_real;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {