fts-tokenizer.h revision 8b1a9a4d63b0abccdf7cb1acb8359d5396dd657b
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen Settings are given in the form of a const char * const *settings =
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen {"key", "value", "key2", "value2", NULL} array of string pairs. Some
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen keys, like "no_parent" and "search", are a sort of boolean and the
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen value does not matter, just mentioning the key enables the functionality.
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen The array has to be NULL terminated.
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen/* Email address header tokenizer that returns "user@domain.org" input as
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen "user@domain.org" token as well as passing it through to the parent
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen (generic) tokenizer, which also returns "user", "domain" and "org".
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen This allows searching the mails with their individual components, but also
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen allows doing an explicit "user@domain" search, which returns only mails
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen matching that exact address (instead of e.g. a mail with both user@domain2
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen and user2@domain words). */
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen "no_parent", Return only our tokens, no data for parent to process.
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen Defaults to disabled. Should normally not be needed.
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen "search" Remove addresses from parent data stream, so they are not processed
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen further. Defaults to disabled. Enable by defining the keyword (and any value).
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainenextern const struct fts_tokenizer *fts_tokenizer_email_address;
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen/* Generic email content tokenizer. Cuts text into tokens. */
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen "maxlen" Maximum length of token, before an arbitrary cut off is made.
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH.
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen "algorithm", accepted values are "simple" or "tr29". Defines the
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen method for looking for word boundaries. Simple is faster and will
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen work for many texts, especially those using Latin alphabets, but
ad5ece2a07cca5fa033287f70bafdd312e2338a5Timo Sirainen leaves corner cases. The tr29 implements a version of Unicode
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen technical report 29 word boundary lookup. It might work better with
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen e.g. texts containing Katakana or Hebrew characters, but it is not
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen possible to use a single algorithm for all existing languages. It
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen is also significantly slower than simple. The algorithms also
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen differ in some details, e.g. simple will cut "a.b" and tr29 will
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen not. The default is "simple". */
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainenextern const struct fts_tokenizer *fts_tokenizer_generic;
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen Tokenizing workflow, find --> create --> filter --> destroy.
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen Do init before first use and deinit after all done.
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen/* Register all built-in tokenizers. */
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainenconst struct fts_tokenizer *fts_tokenizer_find(const char *name);
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen/* Create a new tokenizer. The settings are described above. */
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainenint fts_tokenizer_create(const struct fts_tokenizer *tok_class,
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen const char *const *settings,
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen const char **error_r);
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainenvoid fts_tokenizer_ref(struct fts_tokenizer *tok);
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainenvoid fts_tokenizer_unref(struct fts_tokenizer **tok);
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen/* Reset FTS tokenizer state */
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainenvoid fts_tokenizer_reset(struct fts_tokenizer *tok);
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen Returns 1 if *token_r was returned, 0 if more data is needed, -1 on error.
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen This function should be called with the same data+size until it
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen returns 0. After that fts_tokenizer_final() should be called until it
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen returns 0 to flush out the final token(s).
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen data must contain only valid complete UTF-8 sequences, but otherwise it
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen may be broken into however small pieces. (Input to this function typically
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen comes from message-decoder, which returns only complete UTF-8 sequences.) */
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainenint fts_tokenizer_next(struct fts_tokenizer *tok,
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen/* Returns same as fts_tokenizer_next(). */
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainenint fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r,
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainen const char **error_r);
b716136fc47efd434d60be5db262b4013e375fa9Timo Sirainenconst char *fts_tokenizer_name(const struct fts_tokenizer *tok);