fts-tokenizer.c revision 12952c18d10fa83be65059471139c2fdc8a00c3d
/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
08d6658a4e2ec8104cd1307f6baa75fdb07a24f8Mark Washenberger#include "lib.h"
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen#include "array.h"
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen#include "istream.h"
2fbc2a7c65d30e46803195ebb4547176b85c22c7Timo Sirainen#include "str.h"
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen#include "strfuncs.h"
3c9783956dea385b322cd7fa6bf8c98c17a907a0Timo Sirainen#include "fts-tokenizer.h"
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen#include "fts-tokenizer-private.h"
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
/* Registry of known tokenizer classes. It is created lazily by
   fts_tokenizer_register() and freed either by fts_tokenizers_deinit() or
   when fts_tokenizer_unregister() removes the last entry. */
static ARRAY(const struct fts_tokenizer *) fts_tokenizer_classes;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainenvoid fts_tokenizers_init(void)
a84eb0599fa1d796206eaed65c4e3239f0799276Timo Sirainen{
a84eb0599fa1d796206eaed65c4e3239f0799276Timo Sirainen if (!array_is_created(&fts_tokenizer_classes)) {
a84eb0599fa1d796206eaed65c4e3239f0799276Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen }
a84eb0599fa1d796206eaed65c4e3239f0799276Timo Sirainen}
a84eb0599fa1d796206eaed65c4e3239f0799276Timo Sirainen
a84eb0599fa1d796206eaed65c4e3239f0799276Timo Sirainenvoid fts_tokenizers_deinit(void)
a84eb0599fa1d796206eaed65c4e3239f0799276Timo Sirainen{
a84eb0599fa1d796206eaed65c4e3239f0799276Timo Sirainen if (array_is_created(&fts_tokenizer_classes))
43d3ea2780b5f8557ede7b4c039e8f56cb8d357dTimo Sirainen array_free(&fts_tokenizer_classes);
a84eb0599fa1d796206eaed65c4e3239f0799276Timo Sirainen}
e25885d4c7c4b392c66bbf26a9b892362d90f001Timo Sirainen
e25885d4c7c4b392c66bbf26a9b892362d90f001Timo Sirainen/* private */
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainenvoid fts_tokenizer_register(const struct fts_tokenizer *tok_class)
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen{
ab0d9eecd85f74acae18fe88529302e0776cc500Timo Sirainen if (!array_is_created(&fts_tokenizer_classes))
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen i_array_init(&fts_tokenizer_classes, FTS_TOKENIZER_CLASSES_NR);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen array_append(&fts_tokenizer_classes, &tok_class, 1);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen}
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen/* private */
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainenvoid fts_tokenizer_unregister(const struct fts_tokenizer *tok_class)
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen{
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen const struct fts_tokenizer *const *tp;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen unsigned int idx;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen array_foreach(&fts_tokenizer_classes, tp) {
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen if (strcmp((*tp)->name, tok_class->name) == 0) {
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen idx = array_foreach_idx(&fts_tokenizer_classes, tp);
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen array_delete(&fts_tokenizer_classes, idx, 1);
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen if (array_count(&fts_tokenizer_classes) == 0)
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen array_free(&fts_tokenizer_classes);
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen return;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen }
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen }
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen i_unreached();
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen}
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainenconst struct fts_tokenizer *fts_tokenizer_find(const char *name)
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen{
2cfe9983ce7a6280636ee12beccc2e865111967bTimo Sirainen const struct fts_tokenizer *const *tp;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen array_foreach(&fts_tokenizer_classes, tp) {
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen if (strcmp((*tp)->name, name) == 0)
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen return *tp;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen }
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen return NULL;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen}
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainenconst char *fts_tokenizer_name(const struct fts_tokenizer *tok)
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen{
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen return tok->name;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen}
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainenstatic void fts_tokenizer_self_reset(struct fts_tokenizer *tok)
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen{
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen tok->prev_data = NULL;
2e1e493b248dec0127b1eabeea5a8bc330378fcdTimo Sirainen tok->prev_size = 0;
b7828b34f6d81cdea62761932a1da1a444a29bcdTimo Sirainen tok->prev_skip = 0;
2e1e493b248dec0127b1eabeea5a8bc330378fcdTimo Sirainen tok->prev_reply_finished = TRUE;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen}
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainenint fts_tokenizer_create(const struct fts_tokenizer *tok_class,
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen struct fts_tokenizer *parent,
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen const char *const *settings,
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen struct fts_tokenizer **tokenizer_r,
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen const char **error_r)
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen{
59151b71059df1190acd75d8717ed04a7920c862Timo Sirainen struct fts_tokenizer *tok;
59151b71059df1190acd75d8717ed04a7920c862Timo Sirainen const char *empty_settings = NULL;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen i_assert(settings == NULL || str_array_length(settings) % 2 == 0);
48010d123abfac8cb19f33f1fe12f33a7090089eTimo Sirainen
48010d123abfac8cb19f33f1fe12f33a7090089eTimo Sirainen if (settings == NULL)
48010d123abfac8cb19f33f1fe12f33a7090089eTimo Sirainen settings = &empty_settings;
48010d123abfac8cb19f33f1fe12f33a7090089eTimo Sirainen
48010d123abfac8cb19f33f1fe12f33a7090089eTimo Sirainen if (tok_class->v->create(settings, &tok, error_r) < 0) {
48010d123abfac8cb19f33f1fe12f33a7090089eTimo Sirainen *tokenizer_r = 0;
48010d123abfac8cb19f33f1fe12f33a7090089eTimo Sirainen return -1;
48010d123abfac8cb19f33f1fe12f33a7090089eTimo Sirainen }
48010d123abfac8cb19f33f1fe12f33a7090089eTimo Sirainen tok->refcount = 1;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen fts_tokenizer_self_reset(tok);
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen if (parent != NULL) {
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen fts_tokenizer_ref(parent);
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen tok->parent = parent;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen tok->parent_input = buffer_create_dynamic(default_pool, 128);
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen }
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen *tokenizer_r = tok;
f8a86fdfb0048f9c87bf223373b35416ceb5856bTimo Sirainen return 0;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen}
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainenvoid fts_tokenizer_ref(struct fts_tokenizer *tok)
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen{
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen i_assert(tok->refcount > 0);
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
d5cebe7f98e63d4e2822863ef2faa4971e8b3a5dTimo Sirainen tok->refcount++;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen}
d5cebe7f98e63d4e2822863ef2faa4971e8b3a5dTimo Sirainen
d5cebe7f98e63d4e2822863ef2faa4971e8b3a5dTimo Sirainenvoid fts_tokenizer_unref(struct fts_tokenizer **_tok)
d5cebe7f98e63d4e2822863ef2faa4971e8b3a5dTimo Sirainen{
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen struct fts_tokenizer *tok = *_tok;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen i_assert(tok->refcount > 0);
d5cebe7f98e63d4e2822863ef2faa4971e8b3a5dTimo Sirainen *_tok = NULL;
648d24583c1574441c4fa0331a90bd4d6e7996c5Timo Sirainen
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen if (--tok->refcount > 0)
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen return;
6c2c5f20760b06bfb4a40b0ee2ef5ab016bc41f0Timo Sirainen
if (tok->parent_input != NULL)
buffer_free(&tok->parent_input);
if (tok->parent != NULL)
fts_tokenizer_unref(&tok->parent);
tok->v->destroy(tok);
}
/* Call the tokenizer's own next() callback for the given data block,
   remembering how much of the block has already been consumed.

   While the previous reply is unfinished, the caller must pass the same
   data pointer and size again; tokenizing then continues from the saved
   skip offset. A positive return value records the block and advances the
   offset; a zero return value means a new data block is needed and the
   saved state is cleared. Any other return value leaves the state as-is. */
static int
fts_tokenizer_next_self(struct fts_tokenizer *tok,
			const unsigned char *data, size_t size,
			const char **token_r, const char **error_r)
{
	const unsigned char *input = data;
	size_t input_size = size;
	size_t skip = 0;
	int ret;

	i_assert(tok->prev_reply_finished ||
		 (data == tok->prev_data && size == tok->prev_size));

	if (!tok->prev_reply_finished) {
		/* continuing previous data */
		i_assert(tok->prev_skip <= size);
		input = data + tok->prev_skip;
		input_size = size - tok->prev_skip;
	}
	ret = tok->v->next(tok, input, input_size, &skip, token_r, error_r);

	if (ret == 0) {
		/* we need a new data block */
		fts_tokenizer_self_reset(tok);
	} else if (ret > 0) {
		i_assert(skip <= size - tok->prev_skip);
		tok->prev_reply_finished = FALSE;
		tok->prev_data = data;
		tok->prev_size = size;
		tok->prev_skip += skip;
	}
	return ret;
}
/* Reset the tokenizer: first call its v->reset() callback, then clear the
   saved previous-data state (prev_data, prev_size, prev_skip) and mark the
   previous reply as finished. */
void fts_tokenizer_reset(struct fts_tokenizer *tok)
{
	tok->v->reset(tok);
	fts_tokenizer_self_reset(tok);
}
/* Get the next token for the given data block, chaining through the parent
   tokenizer when there is one.

   The call is driven by tok->parent_state, which persists between calls:
   - ADD_DATA: ask this tokenizer for a token. If it produced none
     (ret <= 0), or there is no parent, or skip_parents is set, that result
     is returned directly. Otherwise the token is copied into parent_input.
   - NEXT_OUTPUT: feed parent_input to the parent tokenizer. Any non-zero
     result is returned, and the state stays here so the next call asks the
     parent again.
   - FINALIZE: call the parent with NULL data of size 0. Any non-zero result
     is returned. On zero the state goes back to ADD_DATA and this function
     calls itself to look for further tokens in the same data.

   A positive return value always comes with a non-empty *token_r (asserted
   below). NOTE(review): negative values presumably signal an error reported
   via error_r - confirm against the next() callback contract. */
int fts_tokenizer_next(struct fts_tokenizer *tok,
		       const unsigned char *data, size_t size,
		       const char **token_r, const char **error_r)
{
	int ret;
	switch (tok->parent_state) {
	case FTS_TOKENIZER_PARENT_STATE_ADD_DATA:
		ret = fts_tokenizer_next_self(tok, data, size, token_r, error_r);
		if (ret <= 0 || tok->parent == NULL || tok->skip_parents)
			break;
		/* replace the parent's input with the token we just got */
		buffer_set_used_size(tok->parent_input, 0);
		buffer_append(tok->parent_input, *token_r, strlen(*token_r));
		/* advance to the next state value (NEXT_OUTPUT, given the
		   fall through below) */
		tok->parent_state++;
		/* fall through */
	case FTS_TOKENIZER_PARENT_STATE_NEXT_OUTPUT:
		ret = fts_tokenizer_next(tok->parent, tok->parent_input->data,
					 tok->parent_input->used, token_r, error_r);
		if (ret != 0)
			break;
		/* the parent returned 0 for this input; move on to finalizing it */
		tok->parent_state++;
		/* fall through */
	case FTS_TOKENIZER_PARENT_STATE_FINALIZE:
		ret = fts_tokenizer_next(tok->parent, NULL, 0, token_r, error_r);
		if (ret != 0)
			break;
		/* we're finished sending this token to parent tokenizer.
		   see if our own tokenizer has more tokens available */
		tok->parent_state = FTS_TOKENIZER_PARENT_STATE_ADD_DATA;
		return fts_tokenizer_next(tok, data, size, token_r, error_r);
	default:
		i_unreached();
	}
	/* we must not be returning empty tokens */
	i_assert(ret <= 0 || (*token_r)[0] != '\0');
	return ret;
}
/* Finalize tokenizing: identical to calling fts_tokenizer_next() with NULL
   data of size 0. The return value, *token_r and *error_r are whatever that
   call produces. */
int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r,
			const char **error_r)
{
	return fts_tokenizer_next(tok, NULL, 0, token_r, error_r);
}