fts-tokenizer.c revision c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "lib.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "array.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "istream.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "str.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "strfuncs.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer-private.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Global list of registered tokenizer classes (stored by value).
   It is created lazily by fts_tokenizer_register() and freed again by
   fts_tokenizer_unregister() once the last class has been removed. */
ARRAY(struct fts_tokenizer) fts_tokenizer_classes;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* private */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_tokenizer_register(const struct fts_tokenizer *tok_class)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (!array_is_created(&fts_tokenizer_classes))
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_array_init(&fts_tokenizer_classes, FTS_TOKENIZER_CLASSES_NR);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen array_append(&fts_tokenizer_classes, tok_class, 1);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* private */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_tokenizer_unregister(const struct fts_tokenizer *tok_class)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_tokenizer *tp;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int idx;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen array_foreach(&fts_tokenizer_classes, tp) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (strcmp(tp->name, tok_class->name) == 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen idx = array_foreach_idx(&fts_tokenizer_classes, tp);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen array_delete(&fts_tokenizer_classes, idx, 1);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (array_count(&fts_tokenizer_classes) == 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen array_free(&fts_tokenizer_classes);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_unreached();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_tokenizer *fts_tokenizer_find(const char *name)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_tokenizer *tp;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen array_foreach(&fts_tokenizer_classes, tp) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (strcmp(tp->name, name) == 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return tp;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return NULL;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenint fts_tokenizer_create(const struct fts_tokenizer *tok_class,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *parent,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *const *settings,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer **tokenizer_r,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char **error_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *empty_settings = NULL;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(settings == NULL || str_array_length(settings) % 2 == 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (settings == NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen settings = &empty_settings;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok_class->v->create(settings, &tok, error_r) < 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *tokenizer_r = 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return -1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->refcount = 1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_reply_finished = TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (parent != NULL) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_ref(parent);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->parent = parent;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->parent_input = buffer_create_dynamic(default_pool, 128);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *tokenizer_r = tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_tokenizer_ref(struct fts_tokenizer *tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(tok->refcount > 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->refcount++;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_tokenizer_unref(struct fts_tokenizer **_tok)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *tok = *_tok;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(tok->refcount > 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *_tok = NULL;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (--tok->refcount > 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->parent_input != NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_free(&tok->parent_input);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->parent != NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_unref(&tok->parent);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->v->destroy(tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const char *
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_next_self(struct fts_tokenizer *tok,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const unsigned char *data, size_t size)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *token;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen size_t skip = 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(tok->prev_reply_finished ||
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen (data == tok->prev_data && size == tok->prev_size));
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (tok->prev_reply_finished) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* whole new data */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen token = tok->v->next(tok, data, size, &skip);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen } else {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* continuing previous data */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(tok->prev_skip <= size);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen token = tok->v->next(tok, data + tok->prev_skip,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen size - tok->prev_skip, &skip);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (token != NULL) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(skip <= size - tok->prev_skip);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_data = data;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_size = size;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_skip = tok->prev_skip + skip;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_reply_finished = FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen } else {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* we need a new data block */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_data = NULL;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_size = 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_skip = 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->prev_reply_finished = TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return token;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst char *
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_tokenizer_next(struct fts_tokenizer *tok,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const unsigned char *data, size_t size)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *token;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen switch (tok->parent_state) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen case FTS_TOKENIZER_PARENT_STATE_ADD_DATA:
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen token = fts_tokenizer_next_self(tok, data, size);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (token == NULL || tok->parent == NULL || tok->skip_parents)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return token;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_set_used_size(tok->parent_input, 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen buffer_append(tok->parent_input, token, strlen(token));
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->parent_state++;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* fall through */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen case FTS_TOKENIZER_PARENT_STATE_NEXT_OUTPUT:
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen token = fts_tokenizer_next(tok->parent, tok->parent_input->data,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->parent_input->used);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (token != NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return token;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->parent_state++;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* fall through */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen case FTS_TOKENIZER_PARENT_STATE_FINALIZE:
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen token = fts_tokenizer_next(tok->parent, NULL, 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (token != NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return token;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* we're finished sending this token to parent tokenizer.
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen see if our own tokenizer has more tokens available */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen tok->parent_state = FTS_TOKENIZER_PARENT_STATE_ADD_DATA;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return fts_tokenizer_next(tok, data, size);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen default:
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_unreached();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}