#ifndef FTS_TOKENIZER_GENERIC_PRIVATE_H
#define FTS_TOKENIZER_GENERIC_PRIVATE_H

/*
 * Function tables (struct fts_tokenizer_vfuncs) for the generic tokenizer.
 * Both objects are only declared here; their definitions live outside this
 * header. The suffixes match the "simple" and "tr29" boundary algorithms.
 */
extern const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple;
extern const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29;

/*
 * Word boundary letter type.
 *
 * Classification given to a character while looking for word boundaries.
 * The names follow the word-break classes of Unicode TR29, with one local
 * addition (LETTER_TYPE_APOSTROPHE).
 *
 * The numeric values are implied by declaration order (NONE = 0 through
 * OTHER = 20), so inserting or reordering enumerators changes them.
 */
enum letter_type {
	LETTER_TYPE_NONE = 0,
	LETTER_TYPE_CR,
	LETTER_TYPE_LF,
	LETTER_TYPE_NEWLINE,
	LETTER_TYPE_EXTEND,
	LETTER_TYPE_REGIONAL_INDICATOR,
	LETTER_TYPE_FORMAT,
	LETTER_TYPE_KATAKANA,
	LETTER_TYPE_HEBREW_LETTER,
	LETTER_TYPE_ALETTER,
	LETTER_TYPE_SINGLE_QUOTE,
	LETTER_TYPE_DOUBLE_QUOTE,
	LETTER_TYPE_MIDNUMLET,
	LETTER_TYPE_MIDLETTER,
	LETTER_TYPE_MIDNUM,
	LETTER_TYPE_NUMERIC,
	LETTER_TYPE_EXTENDNUMLET,
	LETTER_TYPE_SOT,	/* TR29 "sot": start of text */
	LETTER_TYPE_EOT,	/* TR29 "eot": end of text */
	LETTER_TYPE_APOSTROPHE, /* Own modification to TR29 */
	LETTER_TYPE_OTHER /* WB14 "any" */
};

/*
 * Word boundary detection algorithm.
 *
 * Each real algorithm is paired with an ALGORITHM_*_NAME macro holding its
 * textual name. The macros sit next to their enumerators on purpose so the
 * two stay in sync; where the names are consumed is not visible in this
 * header.
 */
enum boundary_algorithm {
	BOUNDARY_ALGORITHM_NONE = 0,
	BOUNDARY_ALGORITHM_SIMPLE,
#define ALGORITHM_SIMPLE_NAME "simple"
	BOUNDARY_ALGORITHM_TR29
#define ALGORITHM_TR29_NAME "tr29"
};

c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstruct generic_fts_tokenizer {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer tokenizer;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int max_length;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila bool wb5a; /* TR29 rule for prefix separation
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila in e.g. French or Italian. */
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila bool seen_wb5a;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila unichar_t prev_letter_c;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila unichar_t letter_c;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen enum boundary_algorithm algorithm;
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila enum letter_type prev_letter;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen enum letter_type prev_prev_letter;
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen size_t untruncated_length;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_t *token;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
#endif