fts-tokenizer.c revision 16dd1fd16f0c6dbd4a057327370b432684e301ec
02c335c23bf5fa225a467c19f2c063fb0dc7b8c3Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch#include "lib.h"
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch#include "array.h"
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch#include "istream.h"
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch#include "str.h"
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch#include "strfuncs.h"
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch#include "fts-tokenizer.h"
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch#include "fts-tokenizer-private.h"
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
bdd36cfdba3ff66d25570a9ff568d69e1eb543cfTimo Sirainenstatic ARRAY(const struct fts_tokenizer *) fts_tokenizer_classes;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschvoid fts_tokenizers_init(void)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (!array_is_created(&fts_tokenizer_classes)) {
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch fts_tokenizer_register(fts_tokenizer_generic);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch fts_tokenizer_register(fts_tokenizer_email_address);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch }
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschvoid fts_tokenizers_deinit(void)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (array_is_created(&fts_tokenizer_classes))
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch array_free(&fts_tokenizer_classes);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch/* private */
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschvoid fts_tokenizer_register(const struct fts_tokenizer *tok_class)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (!array_is_created(&fts_tokenizer_classes))
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch i_array_init(&fts_tokenizer_classes, FTS_TOKENIZER_CLASSES_NR);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch array_append(&fts_tokenizer_classes, &tok_class, 1);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch/* private */
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschvoid fts_tokenizer_unregister(const struct fts_tokenizer *tok_class)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch const struct fts_tokenizer *const *tp;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch unsigned int idx;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch array_foreach(&fts_tokenizer_classes, tp) {
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (strcmp((*tp)->name, tok_class->name) == 0) {
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch idx = array_foreach_idx(&fts_tokenizer_classes, tp);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch array_delete(&fts_tokenizer_classes, idx, 1);
d9a7e950a9cd21f2b4a90ec7759fca9e8fcc7995Timo Sirainen if (array_count(&fts_tokenizer_classes) == 0)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch array_free(&fts_tokenizer_classes);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch }
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch }
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch i_unreached();
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschconst struct fts_tokenizer *fts_tokenizer_find(const char *name)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch const struct fts_tokenizer *const *tp;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch array_foreach(&fts_tokenizer_classes, tp) {
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (strcmp((*tp)->name, name) == 0)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return *tp;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch }
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return NULL;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschconst char *fts_tokenizer_name(const struct fts_tokenizer *tok)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return tok->name;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschint fts_tokenizer_create(const struct fts_tokenizer *tok_class,
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch struct fts_tokenizer *parent,
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch const char *const *settings,
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch struct fts_tokenizer **tokenizer_r,
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch const char **error_r)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch struct fts_tokenizer *tok;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch const char *empty_settings = NULL;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch i_assert(settings == NULL || str_array_length(settings) % 2 == 0);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (settings == NULL)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch settings = &empty_settings;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (tok_class->v->create(settings, &tok, error_r) < 0) {
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch *tokenizer_r = 0;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return -1;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch }
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->refcount = 1;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->prev_reply_finished = TRUE;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (parent != NULL) {
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch fts_tokenizer_ref(parent);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->parent = parent;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->parent_input = buffer_create_dynamic(default_pool, 128);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch }
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch *tokenizer_r = tok;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return 0;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschvoid fts_tokenizer_ref(struct fts_tokenizer *tok)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch i_assert(tok->refcount > 0);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->refcount++;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschvoid fts_tokenizer_unref(struct fts_tokenizer **_tok)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch struct fts_tokenizer *tok = *_tok;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch i_assert(tok->refcount > 0);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch *_tok = NULL;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (--tok->refcount > 0)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (tok->parent_input != NULL)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch buffer_free(&tok->parent_input);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (tok->parent != NULL)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch fts_tokenizer_unref(&tok->parent);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->v->destroy(tok);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschstatic int
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschfts_tokenizer_next_self(struct fts_tokenizer *tok,
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch const unsigned char *data, size_t size,
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch const char **token_r)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch int ret = 0;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch size_t skip = 0;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch i_assert(tok->prev_reply_finished ||
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch (data == tok->prev_data && size == tok->prev_size));
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (tok->prev_reply_finished) {
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch /* whole new data */
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch ret = tok->v->next(tok, data, size, &skip, token_r);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch } else {
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch /* continuing previous data */
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch i_assert(tok->prev_skip <= size);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch ret = tok->v->next(tok, data + tok->prev_skip,
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch size - tok->prev_skip, &skip, token_r);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch }
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (ret > 0) {
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch i_assert(skip <= size - tok->prev_skip);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->prev_data = data;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->prev_size = size;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->prev_skip = tok->prev_skip + skip;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->prev_reply_finished = FALSE;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch } else if (ret == 0) {
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch /* we need a new data block */
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->prev_data = NULL;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->prev_size = 0;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->prev_skip = 0;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->prev_reply_finished = TRUE;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch }
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return ret;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschvoid fts_tokenizer_reset(struct fts_tokenizer *tok)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->v->reset(tok);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Boschint fts_tokenizer_next(struct fts_tokenizer *tok,
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch const unsigned char *data, size_t size,
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch const char **token_r)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch{
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch int ret;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch switch (tok->parent_state) {
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch case FTS_TOKENIZER_PARENT_STATE_ADD_DATA:
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch ret = fts_tokenizer_next_self(tok, data, size, token_r);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (ret <= 0 || tok->parent == NULL || tok->skip_parents)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return ret;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch buffer_set_used_size(tok->parent_input, 0);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch buffer_append(tok->parent_input, *token_r, strlen(*token_r));
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->parent_state++;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch /* fall through */
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch case FTS_TOKENIZER_PARENT_STATE_NEXT_OUTPUT:
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch ret = fts_tokenizer_next(tok->parent, tok->parent_input->data,
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->parent_input->used, token_r);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (ret != 0)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return ret;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->parent_state++;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch /* fall through */
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch case FTS_TOKENIZER_PARENT_STATE_FINALIZE:
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch ret = fts_tokenizer_next(tok->parent, NULL, 0, token_r);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch if (ret != 0)
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return ret;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch /* we're finished sending this token to parent tokenizer.
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch see if our own tokenizer has more tokens available */
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch tok->parent_state = FTS_TOKENIZER_PARENT_STATE_ADD_DATA;
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch return fts_tokenizer_next(tok, data, size, token_r);
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch default:
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch i_unreached();
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch }
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch
/* Signal end of input to the tokenizer: equivalent to calling
   fts_tokenizer_next() with a NULL, zero-length data block. The return
   value and *token_r are those of that call. */
int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r)
{
	const unsigned char *no_data = NULL;

	return fts_tokenizer_next(tok, no_data, 0, token_r);
}
f9511e684858bf5f6ac77ab12254b85b737beae8Stephan Bosch