fts-tokenizer-generic.c revision 50f659bc47b06939dcb7694c928c9f8aa7f56229
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic unsigned char fts_ascii_word_boundaries[128] = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 32-47: !"#$%&()*+,-./ */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, /* 48-63: :;<=>? */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64-79: @ */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 80-95: [\]^ */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96-111: ` */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0 /* 112-127: {|}~ */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_create(const char *const *settings,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char **error_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
50f659bc47b06939dcb7694c928c9f8aa7f56229Teemu Huovila enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *key = settings[i], *value = settings[i+1];
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen } else if (strcasecmp(key, "algorithm") == 0) {
50f659bc47b06939dcb7694c928c9f8aa7f56229Teemu Huovila if (strcasecmp(value, ALGORITHM_TR29_NAME) == 0)
50f659bc47b06939dcb7694c928c9f8aa7f56229Teemu Huovila else if (strcasecmp(value, ALGORITHM_SIMPLE_NAME) == 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *error_r = t_strdup_printf("Unknown setting: %s", key);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->token = buffer_create_dynamic(default_pool, 64);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_destroy(struct fts_tokenizer *_tok)
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilafts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const char **token_r)
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila *token_r = t_strndup(tok->token->data, tok->token->used);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* TODO: This is duplicated from unichar.c */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool uint32_find(const uint32_t *data, unsigned int count,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen BINARY_NUMBER_SEARCH(data, count, value, idx_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int idx;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* Unicode General Punctuation, including deprecated characters. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* From word-break-data.c, which is generated from PropList.txt. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Pattern_White_Space, N_ELEMENTS(Pattern_White_Space), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainendata_is_word_boundary(const unsigned char *data, size_t size, size_t *i)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return fts_ascii_word_boundaries[data[*i]] != 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* unicode punctuation? */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uni_utf8_get_char_n(data + *i, size - *i, &c) <= 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (i = 0; i < size; i++) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* no text read yet */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* word boundary found - return a new token */
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return fts_tokenizer_generic_simple_current_token(tok, token_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* word boundary not found yet */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_append(tok->token, data + start, I_MIN(len, tok->max_length));
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* return the last token */
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return fts_tokenizer_generic_simple_current_token(tok, token_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* TODO: Arrange array searches roughly in order of likelihood of a match.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Make some array of the arrays, so this can be a foreach loop.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Check for Hangul.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen U+058A ( ֊ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic enum letter_type letter_type(unichar_t c)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int idx;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Newline, N_ELEMENTS(Newline), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Extend, N_ELEMENTS(Extend), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Regional_Indicator, N_ELEMENTS(Regional_Indicator), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Format, N_ELEMENTS(Format), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Katakana, N_ELEMENTS(Katakana), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Hebrew_Letter, N_ELEMENTS(Hebrew_Letter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(ALetter, N_ELEMENTS(ALetter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Single_Quote, N_ELEMENTS(Single_Quote), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Double_Quote, N_ELEMENTS(Double_Quote), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidNumLet, N_ELEMENTS(MidNumLet), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidLetter, N_ELEMENTS(MidLetter), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(MidNum, N_ELEMENTS(MidNum), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(Numeric, N_ELEMENTS(Numeric), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uint32_find(ExtendNumLet, N_ELEMENTS(ExtendNumLet), c, &idx))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_panic(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* WB3, WB3a and WB3b, but really different since we try to eat
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen whitespace between words. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_cr_lf_newline(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_extend_format(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_regional_indicator(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_REGIONAL_INDICATOR)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_katakana(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_hebrew(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB7 WB7c */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_prev_letter == LETTER_TYPE_HEBREW_LETTER &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_aletter(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_prev_letter == LETTER_TYPE_ALETTER &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_single_quote(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_double_quote(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midnumlet(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midletter(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_midnum(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_numeric(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if(tok->prev_prev_letter == LETTER_TYPE_NUMERIC &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_extendnumlet(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_HEBREW_LETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenadd_prev_letter(struct generic_fts_tokenizer *tok, enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Define what to skip between words.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Include double quotation marks? Messes up parsing?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Does this "reverse approach" include too much in "whitespace"?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Possibly use is_word_break()?
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lt == LETTER_TYPE_SINGLE_QUOTE || lt == LETTER_TYPE_NUMERIC)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE; /* TODO: Include LETTER_TYPE_DOUBLE_QUOTE? */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* The way things are done WB6/7 and WB11/12 "false positives" can
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen leave trailing unwanted chars. They are searched for here. This is
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen very kludgy and should be coded into the rules themselves
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool is_one_past_end(struct generic_fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* Short circuit for simple algorithm. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB6/7 false positive detected at one past end. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_MIDLETTER ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE )
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* WB11/12 false positive detected at one past end. */
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovilafts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila const char **token_r)
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila *token_r = t_strndup(tok->token->data, tok->token->used - end_skip);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen Find word boundaries in input text. Based on Unicode standard annex
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen #29, but tailored for FTS purposes.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen Adaptations: No word boundary at Start-Of-Text or End-of-Text (WB1 and
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen WB2). Break just once, not before and after. Other things also, not
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen really pure tr29. Meant to assist in finding individual words.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: If this letter_fns based approach is too kludgy, do a FSM with function
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen pointers and transition tables.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Alternative idea: Replace everything with a super simplistic
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "lt != ALETTER, HEBREW, NUMERIC, ... --> word break"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen TODO: Rules get split up over several functions. Is it too
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenuni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* No rule knows what to do with just one char, except the linebreaks
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen we eat away (above) anyway. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* Extend and format types are ignored. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (lt == LETTER_TYPE_EXTEND || lt == LETTER_TYPE_FORMAT)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_next_tr29(struct fts_tokenizer *_tok,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* TODO: Process 8bit chars separately, to speed things up. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (i = 0; i < size; i++) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->last_size = uni_utf8_char_bytes(data[i]);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i += tok->last_size - 1; /* Utf8 bytes > 1, for() handles the 1 byte increment. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_letter == LETTER_TYPE_NONE && is_nonword(lt)) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* TODO: test that start_skip works with multibyte utf8 chars */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen start_skip = i + 1; /* Skip non-token chars at start of data */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(len >= start_skip && size >= start_skip);
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return fts_tokenizer_generic_tr29_current_token(tok, token_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(len >= start_skip && size >= start_skip);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_append(tok->token, data + start_skip, len - start_skip);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* return the last token */
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila return fts_tokenizer_generic_tr29_current_token(tok, token_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_tokenizer fts_tokenizer_generic_real = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer *fts_tokenizer_generic = &fts_tokenizer_generic_real;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {