fts-user.c revision fac865bad1ba10e85d80b63dedfd3493a65510d4
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen#include "lib.h"
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen#include "module-context.h"
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen#include "mail-user.h"
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen#include "fts-language.h"
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen#include "fts-filter.h"
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen#include "fts-tokenizer.h"
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen#include "fts-user.h"
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen
/* Fetch the fts_user context registered for a mail_user under
   fts_user_module. */
#define FTS_USER_CONTEXT(obj) MODULE_CONTEXT(obj, fts_user_module)

/* Lists used when the fts_tokenizers / fts_filters plugin settings (and, for
   filters, the language-specific variant) are not set. */
#define FTS_DEFAULT_TOKENIZERS "generic email-address"
#define FTS_DEFAULT_FILTERS "normalizer-icu snowball"
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainenstruct fts_user {
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen union mail_user_module_context module_ctx;
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen struct fts_language_list *lang_list;
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen struct fts_tokenizer *index_tokenizer, *search_tokenizer;
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen struct fts_user_language *data_lang;
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen ARRAY_TYPE(fts_user_language) languages;
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen};
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainenstatic MODULE_CONTEXT_DEFINE_INIT(fts_user_module,
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen &mail_user_module_register);
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainenstatic const char *const *str_keyvalues_to_array(const char *str)
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen{
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen const char *key, *value, *const *keyvalues;
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen ARRAY_TYPE(const_string) arr;
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen unsigned int i;
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen if (str == NULL)
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen return NULL;
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen t_array_init(&arr, 8);
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen keyvalues = t_strsplit_spaces(str, " ");
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen for (i = 0; keyvalues[i] != NULL; i++) {
09caccbfa7a47fbcd428b0618dfdf46d1679390aTimo Sirainen value = strchr(keyvalues[i], '=');
09caccbfa7a47fbcd428b0618dfdf46d1679390aTimo Sirainen if (value != NULL)
09caccbfa7a47fbcd428b0618dfdf46d1679390aTimo Sirainen key = t_strdup_until(keyvalues[i], value++);
09caccbfa7a47fbcd428b0618dfdf46d1679390aTimo Sirainen else {
09caccbfa7a47fbcd428b0618dfdf46d1679390aTimo Sirainen key = keyvalues[i];
09caccbfa7a47fbcd428b0618dfdf46d1679390aTimo Sirainen value = "";
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen }
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen array_append(&arr, &key, 1);
01f54478a7c69b88ab13840c99bbab19a0d7d754Timo Sirainen array_append(&arr, &value, 1);
01f54478a7c69b88ab13840c99bbab19a0d7d754Timo Sirainen }
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen array_append_zero(&arr);
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen return array_idx(&arr, 0);
01f54478a7c69b88ab13840c99bbab19a0d7d754Timo Sirainen}
01f54478a7c69b88ab13840c99bbab19a0d7d754Timo Sirainen
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainenstatic int
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainenfts_user_init_languages(struct mail_user *user, struct fts_user *fuser,
01f54478a7c69b88ab13840c99bbab19a0d7d754Timo Sirainen const char **error_r)
01f54478a7c69b88ab13840c99bbab19a0d7d754Timo Sirainen{
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen const char *languages, *unknown;
e6837a4f1118565d2ac460dce140705d56f779b3Timo Sirainen const char *lang_config[3] = {NULL, NULL, NULL};
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen languages = mail_user_plugin_getenv(user, "fts_languages");
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen if (languages == NULL) {
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen *error_r = "fts_languages setting is missing";
77a8c99da71844aaf0fa3036960473024d19f471Timo Sirainen return -1;
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen }
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen lang_config[1] = mail_user_plugin_getenv(user, "fts_language_config");
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen fuser->lang_list = fts_language_list_init(lang_config);
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen if (lang_config[1] != NULL)
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen lang_config[0] = "fts_language_config";
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen if (!fts_language_list_add_names(fuser->lang_list, languages, &unknown)) {
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen *error_r = t_strdup_printf(
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen "fts_languages: Unknown language '%s'", unknown);
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen return -1;
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen }
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen if (array_count(fts_language_list_get_all(fuser->lang_list)) == 0) {
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen *error_r = "fts_languages setting is empty";
c1252a5812eb11fcb81508b9ed37597a5bc84100Timo Sirainen return -1;
}
return 0;
}
static int
fts_user_create_filters(struct mail_user *user, const struct fts_language *lang,
struct fts_filter **filter_r, const char **error_r)
{
const struct fts_filter *filter_class;
struct fts_filter *filter = NULL, *parent = NULL;
const char *filters_key, *const *filters, *filter_set_name;
const char *str, *error, *set_key;
unsigned int i;
int ret = 0;
/* try to get the language-specific filters first */
filters_key = t_strconcat("fts_filters_", lang->name, NULL);
str = mail_user_plugin_getenv(user, filters_key);
if (str == NULL) {
/* fallback to global filters */
filters_key = "fts_filters";
str = mail_user_plugin_getenv(user, filters_key);
if (str == NULL) {
str = FTS_DEFAULT_FILTERS;
filters_key = "fts_filters(built-in default)";
}
}
filters = t_strsplit_spaces(str, " ");
for (i = 0; filters[i] != NULL; i++) {
filter_class = fts_filter_find(filters[i]);
if (filter_class == NULL) {
*error_r = t_strdup_printf("%s: Unknown filter '%s'",
filters_key, filters[i]);
ret = -1;
break;
}
/* try the language-specific setting first */
filter_set_name = t_str_replace(filters[i], '-', '_');
set_key = t_strdup_printf("fts_filters_%s_%s",
lang->name, filter_set_name);
str = mail_user_plugin_getenv(user, set_key);
if (str == NULL) {
set_key = t_strdup_printf("fts_filters_%s", filter_set_name);
str = mail_user_plugin_getenv(user, set_key);
}
if (fts_filter_create(filter_class, parent, lang,
str_keyvalues_to_array(str),
&filter, &error) < 0) {
*error_r = t_strdup_printf("%s: %s", set_key, error);
ret = -1;
break;
}
if (parent != NULL)
fts_filter_unref(&parent);
parent = filter;
}
if (ret < 0) {
if (parent != NULL)
fts_filter_unref(&parent);
return -1;
}
*filter_r = filter;
return 0;
}
/* Build a tokenizer chain from the "fts_tokenizers" setting, falling back to
   FTS_DEFAULT_TOKENIZERS. Each tokenizer is created with the previously
   created one as its parent and with the settings found in
   "fts_tokenizers_<name>" ('-' in the name replaced by '_'). When search is
   TRUE, "search=yes" is prepended to those settings.

   Returns 0 and stores the last created tokenizer in *tokenizer_r (NULL if
   the list was empty). Returns -1 and sets *error_r on failure. */
static int
fts_user_create_tokenizer(struct mail_user *user,
			  struct fts_tokenizer **tokenizer_r, bool search,
			  const char **error_r)
{
	const struct fts_tokenizer *tok_class;
	struct fts_tokenizer *tokenizer = NULL, *parent = NULL;
	const char *list_key, *const *name, *set_name;
	const char *value, *error, *set_key;
	bool failed = FALSE;

	list_key = "fts_tokenizers";
	value = mail_user_plugin_getenv(user, list_key);
	if (value == NULL)
		value = FTS_DEFAULT_TOKENIZERS;

	for (name = t_strsplit_spaces(value, " "); *name != NULL; name++) {
		tok_class = fts_tokenizer_find(*name);
		if (tok_class == NULL) {
			*error_r = t_strdup_printf("%s: Unknown tokenizer '%s'",
						   list_key, *name);
			failed = TRUE;
			break;
		}

		set_name = t_str_replace(*name, '-', '_');
		set_key = t_strdup_printf("fts_tokenizers_%s", set_name);
		value = mail_user_plugin_getenv(user, set_key);

		/* let the tokenizer know it is handling a search string
		   rather than data being indexed */
		if (search)
			value = t_strconcat("search=yes ", value, NULL);

		if (fts_tokenizer_create(tok_class, parent,
					 str_keyvalues_to_array(value),
					 &tokenizer, &error) < 0) {
			*error_r = t_strdup_printf("%s: %s", set_key, error);
			failed = TRUE;
			break;
		}
		/* drop our own reference to the previous link in the chain */
		if (parent != NULL)
			fts_tokenizer_unref(&parent);
		parent = tokenizer;
	}

	if (failed) {
		if (parent != NULL)
			fts_tokenizer_unref(&parent);
		return -1;
	}
	*tokenizer_r = tokenizer;
	return 0;
}
/* Create the index tokenizer first and then the search tokenizer.
   Returns 0 on success, -1 with *error_r set by the failing creation. */
static int fts_user_init_tokenizers(struct mail_user *user,
				    struct fts_user *fuser,
				    const char **error_r)
{
	/* short-circuit: the search tokenizer is not attempted if the index
	   tokenizer could not be created */
	if (fts_user_create_tokenizer(user, &fuser->index_tokenizer, FALSE,
				      error_r) < 0 ||
	    fts_user_create_tokenizer(user, &fuser->search_tokenizer, TRUE,
				      error_r) < 0)
		return -1;
	return 0;
}
/* Look up the user's entry for a language by name.
   Returns the matching entry, or NULL if the user has none. */
struct fts_user_language *
fts_user_language_find(struct mail_user *user,
		       const struct fts_language *lang)
{
	struct fts_user *fuser = FTS_USER_CONTEXT(user);
	struct fts_user_language *const *entry;

	array_foreach(&fuser->languages, entry) {
		struct fts_user_language *candidate = *entry;

		if (strcmp(candidate->lang->name, lang->name) == 0)
			return candidate;
	}
	return NULL;
}
/* Return the tokenizer chain the user's FTS context holds for indexed data. */
struct fts_tokenizer *fts_user_get_index_tokenizer(struct mail_user *user)
{
	struct fts_user *ctx = FTS_USER_CONTEXT(user);

	return ctx->index_tokenizer;
}
/* Return the tokenizer chain the user's FTS context holds for search
   strings. */
struct fts_tokenizer *fts_user_get_search_tokenizer(struct mail_user *user)
{
	struct fts_user *ctx = FTS_USER_CONTEXT(user);

	return ctx->search_tokenizer;
}
/* Create the filter chain for lang and append a new language entry, allocated
   from user->pool, to fuser->languages.
   Returns 0 on success, -1 with *error_r set if the filters cannot be
   created (nothing is appended in that case). */
static int fts_user_language_create(struct mail_user *user,
				    struct fts_user *fuser,
				    const struct fts_language *lang,
				    const char **error_r)
{
	struct fts_user_language *entry;
	struct fts_filter *chain;

	if (fts_user_create_filters(user, lang, &chain, error_r) < 0)
		return -1;

	entry = p_new(user->pool, struct fts_user_language, 1);
	entry->filter = chain;
	entry->lang = lang;
	array_append(&fuser->languages, &entry, 1);
	return 0;
}
/* Create a user language entry for every language in fuser->lang_list.
   Stops at the first failure and returns -1 with *error_r set; returns 0
   once all entries have been created. */
static int fts_user_languages_fill_all(struct mail_user *user,
				       struct fts_user *fuser,
				       const char **error_r)
{
	const struct fts_language *const *cur;

	array_foreach(fts_language_list_get_all(fuser->lang_list), cur) {
		const struct fts_language *lang = *cur;

		if (fts_user_language_create(user, fuser, lang, error_r) < 0)
			return -1;
	}
	return 0;
}
/* Return the language list held by the user's FTS context. */
struct fts_language_list *fts_user_get_language_list(struct mail_user *user)
{
	struct fts_user *ctx = FTS_USER_CONTEXT(user);

	return ctx->lang_list;
}
/* Return a pointer to the array of per-language entries stored in the user's
   FTS context. */
const ARRAY_TYPE(fts_user_language) *
fts_user_get_all_languages(struct mail_user *user)
{
	struct fts_user *ctx = FTS_USER_CONTEXT(user);

	return &ctx->languages;
}
struct fts_user_language *fts_user_get_data_lang(struct mail_user *user)
{
struct fts_user *fuser = FTS_USER_CONTEXT(user);
struct fts_user_language *lang;
const char *error;
if (fuser->data_lang != NULL)
return fuser->data_lang;
lang = p_new(user->pool, struct fts_user_language, 1);
lang->lang = &fts_language_data;
if (fts_filter_create(fts_filter_lowercase, NULL, lang->lang, NULL,
&lang->filter, &error) < 0)
i_unreached();
i_assert(lang->filter != NULL);
fuser->data_lang = lang;
return fuser->data_lang;
}
static void fts_user_free(struct fts_user *fuser)
{
struct fts_user_language *const *user_langp;
if (fuser->lang_list != NULL)
fts_language_list_deinit(&fuser->lang_list);
array_foreach(&fuser->languages, user_langp) {
if ((*user_langp)->filter != NULL)
fts_filter_unref(&(*user_langp)->filter);
}
if (fuser->data_lang != NULL && fuser->data_lang->filter != NULL)
fts_filter_unref(&fuser->data_lang->filter);
if (fuser->index_tokenizer != NULL)
fts_tokenizer_unref(&fuser->index_tokenizer);
if (fuser->search_tokenizer != NULL)
fts_tokenizer_unref(&fuser->search_tokenizer);
}
/* Allocate the FTS context for a user from user->pool, then initialize its
   language list, per-language filter chains and tokenizers, in that order.
   On success the context is registered on the user and 0 is returned. On
   failure the partially built context is released, nothing is registered,
   and -1 is returned with *error_r set by the failing step. */
int fts_mail_user_init(struct mail_user *user, const char **error_r)
{
	struct fts_user *fuser = p_new(user->pool, struct fts_user, 1);

	p_array_init(&fuser->languages, user->pool, 4);

	/* short-circuit evaluation keeps the steps in order and stops at the
	   first failure */
	if (fts_user_init_languages(user, fuser, error_r) < 0 ||
	    fts_user_languages_fill_all(user, fuser, error_r) < 0 ||
	    fts_user_init_tokenizers(user, fuser, error_r) < 0) {
		fts_user_free(fuser);
		return -1;
	}

	MODULE_CONTEXT_SET(user, fts_user_module, fuser);
	return 0;
}
/* Release the user's FTS context, if one is registered. */
void fts_mail_user_deinit(struct mail_user *user)
{
	struct fts_user *fuser = FTS_USER_CONTEXT(user);

	if (fuser == NULL)
		return;
	fts_user_free(fuser);
}