fts-tokenizer.h revision 4ef1f9f3293965734e6e3c38c191ceb2246a721f
7e209b78ca757294dbbc15604c88673b3a6b0c39Timo Sirainen Settings are given in the form of a const char * const *settings =
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen {"key", "value", "key2", "value2", NULL} array of string pairs. Some
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen keys, like "no_parent" and "search" are a sort of boolean and the
636f017be100bce67d66fd3ae1544a47681efd33Timo Sirainen value does not matter, just mentioning the key enables the functionality.
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen The array has to be NULL terminated.
7a7d2aa11e46195e2d92d6c337d7e78052a5ce67Timo Sirainen/* Email address header tokenizer that returns "user@domain.org" input as
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen "user@domain.org" token as well as passing it through to the parent
5c2d695acf9f95ae0dcdda89c4d2391ceda4d672Timo Sirainen (generic) tokenizer, which also returns "user", "domain" and "org".
a8fe899601735459641edae975c0fa08be8482e2Timo Sirainen This allows searching the mails with their individual components, but also
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen allows doing an explicit "user@domain" search, which returns only mails
111a7dda02defa4d612468cfc3c40da5240645afTimo Sirainen matching that exact address (instead of e.g. a mail with both user@domain2
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen and user2@domain words). */
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen "no_parent", Return only our tokens, no data for parent to process.
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen Defaults to disabled. Should normally not be needed.
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen "search" Remove addresses from parent data stream, so they are not processed
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen further. Defaults to disabled. Enable by defining the keyword (and any value).
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainenextern const struct fts_tokenizer *fts_tokenizer_email_address;
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen#define FTS_TOKENIZER_EMAIL_ADDRESS_NAME "email-address"
111a7dda02defa4d612468cfc3c40da5240645afTimo Sirainen/* Generic email content tokenizer. Cuts text into tokens. */
111a7dda02defa4d612468cfc3c40da5240645afTimo Sirainen "maxlen" Maximum length of token, before an arbitrary cut off is made.
8a524e87e44ae629cf90262e92f7972ea3450c35Timo Sirainen Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH.
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen "algorithm", accepted values are "simple" or "tr29". Defines the
db0735f9b388c5bcfb781b1b25015e898d63d953Timo Sirainen method for looking for word boundaries. Simple is faster and will
db0735f9b388c5bcfb781b1b25015e898d63d953Timo Sirainen work for many texts, especially those using Latin alphabets, but
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen leaves corner cases. The tr29 implements a version of Unicode
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen technical report 29 word boundary lookup. It might work better with
5c2d695acf9f95ae0dcdda89c4d2391ceda4d672Timo Sirainen e.g. texts containing Katakana or Hebrew characters, but it is not
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen possible to use a single algorithm for all existing languages. It
5c2d695acf9f95ae0dcdda89c4d2391ceda4d672Timo Sirainen is also significantly slower than simple. The algorithms also
db0735f9b388c5bcfb781b1b25015e898d63d953Timo Sirainen differ in some details, e.g. simple will cut "a.b" and tr29 will
d798962a54c5cda054d57a0cfc7e5f47dfa20f6eTimo Sirainen not. The default is "simple" */
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainenextern const struct fts_tokenizer *fts_tokenizer_generic;
6b265a8a9d1ce3b3a8033445e99c9035d62ffbc7Timo Sirainen Tokenizing workflow, find --> create --> filter --> destroy.
6b265a8a9d1ce3b3a8033445e99c9035d62ffbc7Timo Sirainen Do init before first use and deinit after all done.
6b265a8a9d1ce3b3a8033445e99c9035d62ffbc7Timo Sirainen/* Register all built-in tokenizers. */
5c2d695acf9f95ae0dcdda89c4d2391ceda4d672Timo Sirainenconst struct fts_tokenizer *fts_tokenizer_find(const char *name);
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen/* Create a new tokenizer. The settings are described above. */
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainenint fts_tokenizer_create(const struct fts_tokenizer *tok_class,
5c2d695acf9f95ae0dcdda89c4d2391ceda4d672Timo Sirainen const char *const *settings,
84078771687fabf75819918f0f3aecdc3ed08b36Timo Sirainen const char **error_r);
d798962a54c5cda054d57a0cfc7e5f47dfa20f6eTimo Sirainenvoid fts_tokenizer_ref(struct fts_tokenizer *tok);
5c2d695acf9f95ae0dcdda89c4d2391ceda4d672Timo Sirainenvoid fts_tokenizer_unref(struct fts_tokenizer **tok);
5c2d695acf9f95ae0dcdda89c4d2391ceda4d672Timo Sirainen/* Returns the next token, or NULL if more data is needed for the next token.
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen This function should be called with the same data+size until it returns
428fb4dc39c6e9b2eb36216c396dad6096a65f8fTimo Sirainen NULL. When the input is finished, this function should still be called
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen with size=0 to flush out the final token(s).
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen data must contain only valid complete UTF-8 sequences, but otherwise it
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen may be broken into however small pieces. */
db0735f9b388c5bcfb781b1b25015e898d63d953Timo Sirainenconst char *fts_tokenizer_name(const struct fts_tokenizer *tok);