c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#ifndef FTS_TOKENIZER_H
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#define FTS_TOKENIZER_H
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/*
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen Settings are given in the form of a const char * const *settings =
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila {"key", "value", "key2", "value2", NULL} array of string pairs. Some
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila keys, like "no_parent" and "search", act as booleans: the value does
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila not matter, and just mentioning the key enables the functionality.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen The array has to be NULL-terminated.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen*/
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* Email address header tokenizer that returns "user@domain.org" input as a
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "user@domain.org" token as well as passing it through to the parent
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (generic) tokenizer, which also returns "user", "domain" and "org".
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen This allows searching the mails by their individual components, but also
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen allows doing an explicit "user@domain" search, which returns only mails
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen matching that exact address (instead of e.g. a mail with both user@domain2
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen and user2@domain words). */
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila/* Settings (both are boolean-style keys: the value is ignored):
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila "no_parent": Return only our tokens, no data for parent to process.
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila Defaults to disabled. Should normally not be needed.
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila "search": Remove addresses from the parent data stream, so they are not
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila processed further. Defaults to disabled. Enable by defining the keyword
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila (with any value). */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenextern const struct fts_tokenizer *fts_tokenizer_email_address;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* Generic email content tokenizer. Cuts text into tokens. */
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila/* Settings:
211c638d81d382517d196ad47565e0d85012c927klemens "maxlen": Maximum length of a token, before an arbitrary cut-off is made.
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH.
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "algorithm": Accepted values are "simple" or "tr29". Defines the
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen method for looking for word boundaries. Simple is faster and will
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen work for many texts, especially those using Latin alphabets, but
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen leaves corner cases. The tr29 implements a version of Unicode
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen Technical Report 29 word boundary lookup. It might work better with
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen e.g. texts containing Katakana or Hebrew characters, but it is not
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen possible to use a single algorithm for all existing languages. It
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen is also significantly slower than simple. The algorithms also
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen differ in some details, e.g. simple will cut "a.b" and tr29 will
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen not. The default is "simple". */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenextern const struct fts_tokenizer *fts_tokenizer_generic;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila/*
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila Tokenizing workflow: find --> create --> next --> final --> unref.
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila Do fts_tokenizers_init() before first use and fts_tokenizers_deinit() after all done.
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila */
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila/* Register all built-in tokenizers. Must be done before first use. */
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovilavoid fts_tokenizers_init(void);
/* Counterpart of fts_tokenizers_init(); call it after all tokenizer use is
   done. */
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovilavoid fts_tokenizers_deinit(void);
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila
/* Look up a tokenizer class by its name. The result is what
   fts_tokenizer_create() takes as tok_class.
   NOTE(review): the result for an unknown name is not visible in this
   header - presumably NULL; confirm against the implementation. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer *fts_tokenizer_find(const char *name);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovila/* Create a new tokenizer. The settings are described above. */
/* tok_class:   tokenizer class to instantiate.
   parent:      parent tokenizer that data is passed through to.
                NOTE(review): presumably may be NULL when there is no
                parent - confirm.
   settings:    NULL-terminated array of key/value string pairs.
   tokenizer_r: receives the created tokenizer.
   error_r:     receives an error string.
   NOTE(review): the return convention is not stated in this header -
   presumably 0 on success and -1 with *error_r set on failure; confirm
   against the implementation. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenint fts_tokenizer_create(const struct fts_tokenizer *tok_class,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *parent,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *const *settings,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer **tokenizer_r,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char **error_r);
/* Add / drop a reference to a tokenizer.
   NOTE(review): presumably fts_tokenizer_unref() frees the tokenizer when
   the last reference is dropped and sets *tok to NULL - confirm against
   the implementation. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_tokenizer_ref(struct fts_tokenizer *tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_tokenizer_unref(struct fts_tokenizer **tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen/* Reset FTS tokenizer state */
/* NOTE(review): presumably this discards any buffered, partially tokenized
   input so that tok can be reused for unrelated data - confirm. */
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainenvoid fts_tokenizer_reset(struct fts_tokenizer *tok);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen
/* Feed data (size bytes) to the tokenizer and get tokens back one at a
   time via *token_r.
   NOTE(review): *error_r is presumably set only when -1 is returned, and
   the lifetime of the string in *token_r is not stated here - confirm
   against the implementation. */
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila/*
16dd1fd16f0c6dbd4a057327370b432684e301ecTimo Sirainen Returns 1 if *token_r was returned, 0 if more data is needed, -1 on error.
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila This function should be called with the same data+size until it
16dd1fd16f0c6dbd4a057327370b432684e301ecTimo Sirainen returns 0. After that fts_tokenizer_final() should be called until it
16dd1fd16f0c6dbd4a057327370b432684e301ecTimo Sirainen returns 0 to flush out the final token(s).
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen data must contain only valid complete UTF-8 sequences, but otherwise it
16dd1fd16f0c6dbd4a057327370b432684e301ecTimo Sirainen may be broken into however small pieces. (Input to this function typically
16dd1fd16f0c6dbd4a057327370b432684e301ecTimo Sirainen comes from message-decoder, which returns only complete UTF-8 sequences.) */
3dc5a231160859c9627157dc53a94d5e4494fe9fTeemu Huovila
2bb1ef0b669901fb91ff961e7fb074439ef769abTimo Sirainenint fts_tokenizer_next(struct fts_tokenizer *tok,
2bb1ef0b669901fb91ff961e7fb074439ef769abTimo Sirainen const unsigned char *data, size_t size,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const char **token_r, const char **error_r);
16dd1fd16f0c6dbd4a057327370b432684e301ecTimo Sirainen/* Returns same as fts_tokenizer_next(). */
/* Flushes out the final token(s). Call this after fts_tokenizer_next() has
   returned 0, and keep calling it until it returns 0 as well. */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainenint fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r,
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen const char **error_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* NOTE(review): presumably returns the name of the tokenizer's class, i.e.
   the string that fts_tokenizer_find() matches on - confirm against the
   implementation. */
4ef1f9f3293965734e6e3c38c191ceb2246a721fTeemu Huovilaconst char *fts_tokenizer_name(const struct fts_tokenizer *tok);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#endif