fts-tokenizer.h revision 16dd1fd16f0c6dbd4a057327370b432684e301ec
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody Settings are given in the form of a const char * const *settings =
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody {"key", "value", "key2", "value2", NULL} array of string pairs. Some
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody keys, like "no_parent" and "search", are a sort of boolean and the
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody value does not matter, just mentioning the key enables the functionality.
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody The array has to be NULL terminated.
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody/* Email address header tokenizer that returns "user@domain.org" input as
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody "user@domain.org" token as well as passing it through to the parent
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody (generic) tokenizer, which also returns "user", "domain" and "org".
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody This allows searching the mails with their individual components, but also
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody allows doing an explicit "user@domain" search, which returns only mails
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody matching that exact address (instead of e.g. a mail with both user@domain2
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody and user2@domain words). */
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody "no_parent", Return only our tokens, no data for parent to process.
c398eca6b0fc6583687bd6fe2ee2dbcca2ae9387Timo Sirainen Defaults to disabled. Should normally not be needed.
c398eca6b0fc6583687bd6fe2ee2dbcca2ae9387Timo Sirainen "search" Remove addresses from parent data stream, so they are not processed
c398eca6b0fc6583687bd6fe2ee2dbcca2ae9387Timo Sirainen further. Defaults to disabled. Enable by defining the keyword (and any value).
c398eca6b0fc6583687bd6fe2ee2dbcca2ae9387Timo Sirainenextern const struct fts_tokenizer *fts_tokenizer_email_address;
c398eca6b0fc6583687bd6fe2ee2dbcca2ae9387Timo Sirainen/* Generic email content tokenizer. Cuts text into tokens. */
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody "maxlen" Maximum length of token, before an arbitrary cutoff is made.
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH.
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody "algorithm", accepted values are "simple" or "tr29". Defines the
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody method for looking for word boundaries. Simple is faster and will
2ac5f36aa7c2e7a07ba8815d43a6d7483f62e74cTimo Sirainen work for many texts, especially those using Latin alphabets, but
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody leaves corner cases. The tr29 implements a version of Unicode
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody technical report 29 word boundary lookup. It might work better with
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody e.g. texts containing Katakana or Hebrew characters, but it is not
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody possible to use a single algorithm for all existing languages. It
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody is also significantly slower than simple. The algorithms also
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody differ in some details, e.g. simple will cut "a.b" and tr29 will
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody not. The default is "simple". */
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmodyextern const struct fts_tokenizer *fts_tokenizer_generic;
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody Tokenizing workflow, find --> create --> filter --> destroy.
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody Do init before first use and deinit after all done.
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody/* Register all built-in tokenizers. */
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmodyconst struct fts_tokenizer *fts_tokenizer_find(const char *name);
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody/* Create a new tokenizer. The settings are described above. */
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmodyint fts_tokenizer_create(const struct fts_tokenizer *tok_class,
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody const char *const *settings,
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody const char **error_r);
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmodyvoid fts_tokenizer_ref(struct fts_tokenizer *tok);
98c59517ebce19556221065e9231f007bbdd0038Timo Sirainenvoid fts_tokenizer_unref(struct fts_tokenizer **tok);
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody/* Reset FTS tokenizer state */
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmodyvoid fts_tokenizer_reset(struct fts_tokenizer *tok);
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody Returns 1 if *token_r was returned, 0 if more data is needed, -1 on error.
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody This function should be called with the same data+size until it
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody returns 0. After that fts_tokenizer_final() should be called until it
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody returns 0 to flush out the final token(s).
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody data must contain only valid complete UTF-8 sequences, but otherwise it
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody may be broken into however small pieces. (Input to this function typically
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody comes from message-decoder, which returns only complete UTF-8 sequences.) */
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmodyint fts_tokenizer_next(struct fts_tokenizer *tok,
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody const char **token_r);
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmody/* Returns same as fts_tokenizer_next(). */
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmodyint fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r);
1d940afbc02516d8c3d016780e1223a779844a1ePhil Carmodyconst char *fts_tokenizer_name(const struct fts_tokenizer *tok);