bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2015-2018 Dovecot authors, see the included COPYING file */
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainenstatic void strings_deduplicate(ARRAY_TYPE(const_string) *arr)
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainenfts_search_arg_create_or(const struct mail_search_arg *orig_arg, pool_t pool,
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainen /* create the OR arg first as the parent */
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainen or_arg = p_new(pool, struct mail_search_arg, 1);
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainen /* now create all the child args for the OR */
23b586b2cf5760527529f9963c04875c8566a24dTimo Sirainen arg->match_not = FALSE; /* we copied this to the root OR */
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainenfts_backend_dovecot_expand_tokens(struct fts_filter *filter,
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen const char **error_r)
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainen /* first add the word exactly as it without any tokenization */
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainen /* then add it tokenized, but without filtering */
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainen /* add the word filtered */
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen ret = fts_filter_filter(filter, &token2, &error);
98fe03ecad7fcf0973584b9ab0b4dc4848881d56Timo Sirainen } else if (ret < 0) {
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen *error_r = t_strdup_printf("Couldn't filter search token: %s", error);
513b045d3cb2325250e74f0a92c144f9307eee44Timo Sirainen /* The filter dropped the token, which means it was
513b045d3cb2325250e74f0a92c144f9307eee44Timo Sirainen never even indexed. Ignore this word entirely in the
513b045d3cb2325250e74f0a92c144f9307eee44Timo Sirainen search query. */
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainen arg = fts_search_arg_create_or(orig_arg, pool, &tokens);
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainenfts_backend_dovecot_tokenize_lang(struct fts_user_language *user_lang,
23b586b2cf5760527529f9963c04875c8566a24dTimo Sirainen /* we want all the tokens found from the string to be found, so create
23b586b2cf5760527529f9963c04875c8566a24dTimo Sirainen a parent AND and place all the filtered token alternatives under
23b586b2cf5760527529f9963c04875c8566a24dTimo Sirainen and_arg = p_new(pool, struct mail_search_arg, 1);
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen /* reset tokenizer between search args in case there's any state left
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen from some previous failure */
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen fts_tokenizer_reset(user_lang->search_tokenizer);
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen while ((ret = fts_tokenizer_next(user_lang->search_tokenizer,
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen (const void *)orig_token,
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen while (ret >= 0 &&
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen (ret = fts_tokenizer_final(user_lang->search_tokenizer, &token, &error)) > 0) {
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen *error_r = t_strdup_printf("Couldn't tokenize search args: %s", error);
23b586b2cf5760527529f9963c04875c8566a24dTimo Sirainen /* nothing was actually expanded, remove the empty and_arg */
98fe03ecad7fcf0973584b9ab0b4dc4848881d56Timo Sirainenstatic int fts_search_arg_expand(struct fts_backend *backend, pool_t pool,
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainen const ARRAY_TYPE(fts_user_language) *languages;
23b586b2cf5760527529f9963c04875c8566a24dTimo Sirainen struct mail_search_arg *or_arg, *orig_arg = *argp;
dbf26a3ea43cd79fe88f01ec99c7d9440679b996Timo Sirainen const char *error, *orig_token = orig_arg->value.str;
4a3584d6abec84ffd4d430667c6a7c58bb3f8b1aTimo Sirainen (*argp)->type == SEARCH_HEADER_COMPRESS_LWSP) &&
ebcd7cf40e53c2bbc98f7f686e206cda5c0e3111Timo Sirainen !fts_header_has_language((*argp)->hdr_field_name)) {
ebcd7cf40e53c2bbc98f7f686e206cda5c0e3111Timo Sirainen /* use only the data-language */
ebcd7cf40e53c2bbc98f7f686e206cda5c0e3111Timo Sirainen languages = fts_user_get_data_languages(backend->ns->user);
ebcd7cf40e53c2bbc98f7f686e206cda5c0e3111Timo Sirainen languages = fts_user_get_all_languages(backend->ns->user);
23b586b2cf5760527529f9963c04875c8566a24dTimo Sirainen /* OR together all the different expansions for different languages.
23b586b2cf5760527529f9963c04875c8566a24dTimo Sirainen it's enough for one of them to match. */
23b586b2cf5760527529f9963c04875c8566a24dTimo Sirainen or_arg = p_new(pool, struct mail_search_arg, 1);
23b586b2cf5760527529f9963c04875c8566a24dTimo Sirainen if (fts_backend_dovecot_tokenize_lang(*langp, pool, or_arg,
aaed9e3ce98759e0cb1258fc14e1076b71791445Timo Sirainen /* we couldn't parse any tokens from the input */
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainenfts_search_args_expand_tree(struct fts_backend *backend, pool_t pool,
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainen for (; *argp != NULL; argp = &(*argp)->next) {
98fe03ecad7fcf0973584b9ab0b4dc4848881d56Timo Sirainen if (fts_search_args_expand_tree(backend, pool,
5ed7a7fd838ba316cee9c59244d263227eb2b0d8Timo Sirainen /* we're testing for the existence of
5ed7a7fd838ba316cee9c59244d263227eb2b0d8Timo Sirainen the header */
f784d5bb8edbec88829524135cfa100129f5384dTimo Sirainen /* fall through */
98fe03ecad7fcf0973584b9ab0b4dc4848881d56Timo Sirainen ret = fts_search_arg_expand(backend, pool, argp);
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainenint fts_search_args_expand(struct fts_backend *backend,
6eca434b47b7b700f7df80a0e1ce31d0fd45d1fdTimo Sirainen struct mail_search_arg *args_dup, *orig_args = args->args;
44ca7644e6df9e5ce7e7d0cc3767f63153c10bd7Timo Sirainen /* don't keep re-expanding every time the search args are used.
44ca7644e6df9e5ce7e7d0cc3767f63153c10bd7Timo Sirainen this is especially important to avoid an assert-crash in
44ca7644e6df9e5ce7e7d0cc3767f63153c10bd7Timo Sirainen index_search_result_update_flags(). */
98fe03ecad7fcf0973584b9ab0b4dc4848881d56Timo Sirainen /* duplicate the args, so if expansion fails we haven't changed
98fe03ecad7fcf0973584b9ab0b4dc4848881d56Timo Sirainen args_dup = mail_search_arg_dup(args->pool, args->args);
98fe03ecad7fcf0973584b9ab0b4dc4848881d56Timo Sirainen if (fts_search_args_expand_tree(backend, args->pool, &args_dup) < 0)
1537d20b852cbbf0d6971790b84e0cce5ca61307Timo Sirainen /* we'll need to re-simplify the args if we changed anything */
6eca434b47b7b700f7df80a0e1ce31d0fd45d1fdTimo Sirainen /* duplicated args aren't initialized */