lucene-wrapper.cc revision acc72c40c5bfe818013e0ae9c9e73eb90ae8fbb1
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen/* Copyright (c) 2006-2010 Dovecot authors, see the included COPYING file */
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenextern "C" {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "lib.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "array.h"
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#include "unichar.h"
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#include "hash.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "hex-binary.h"
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#include "unlink-directory.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "mail-index.h"
fd2f5fbc1f07aa93e2214a28cdf02437fb7d06c8Timo Sirainen#include "mail-search.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "mail-namespace.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "mailbox-list-private.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "mail-storage.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "fts-expunge-log.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "fts-lucene-plugin.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "lucene-wrapper.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen#include <sys/stat.h>
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen#ifdef HAVE_LUCENE_TEXTCAT
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen# include <libtextcat/textcat.h>
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#else
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#ifdef HAVE_LUCENE_EXTTEXTCAT
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen# include <libexttextcat/textcat.h>
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#endif
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#endif
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen};
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#include <CLucene.h>
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#include <CLucene/util/CLStreams.h>
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include <CLucene/search/MultiPhraseQuery.h>
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "SnowballAnalyzer.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
/* Upper limit given to IndexWriter::setMaxFieldLength() when building.
   10000 is also Lucene's own default, so we simply keep it. */
#define MAX_TERMS_PER_DOCUMENT 10000
/* NOTE(review): not referenced in this part of the file; presumably caps the
   number of terms in a single search query - confirm at the use site. */
#define FTS_LUCENE_MAX_SEARCH_TERMS 1000

/* A write.lock file whose mtime is older than this many seconds is removed
   before a new IndexWriter is opened. */
#define LUCENE_LOCK_OVERRIDE_SECS 60
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenusing namespace lucene::document;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenusing namespace lucene::index;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenusing namespace lucene::search;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenusing namespace lucene::queryParser;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenusing namespace lucene::analysis;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenusing namespace lucene::analysis;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenusing namespace lucene::util;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenstruct lucene_query {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen Query *query;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen BooleanClause::Occur occur;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen};
72cbf33ae81fde08384d30c779ff540752d9256cTimo SirainenARRAY_DEFINE_TYPE(lucene_query, struct lucene_query);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainenstruct lucene_analyzer {
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen char *lang;
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen Analyzer *analyzer;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen};
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenstruct lucene_index {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen char *path;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen struct mailbox_list *list;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen struct fts_lucene_settings set;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen normalizer_func_t *normalizer;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen wchar_t mailbox_guid[MAILBOX_GUID_HEX_LENGTH + 1];
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen IndexReader *reader;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen IndexWriter *writer;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen IndexSearcher *searcher;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen buffer_t *normalizer_buf;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen Analyzer *default_analyzer, *cur_analyzer;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen ARRAY(struct lucene_analyzer) analyzers;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen Document *doc;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen uint32_t prev_uid, prev_part_idx;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen};
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainenstruct rescan_context {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen struct lucene_index *index;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen struct mailbox *box;
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen guid_128_t box_guid;
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen int box_ret;
50c4a9739b55370b1d3950d7b3ec2f7cd2ed5f49Timo Sirainen
50c4a9739b55370b1d3950d7b3ec2f7cd2ed5f49Timo Sirainen pool_t pool;
50c4a9739b55370b1d3950d7b3ec2f7cd2ed5f49Timo Sirainen HASH_TABLE(uint8_t *, uint8_t *) seen_mailbox_guids;
50c4a9739b55370b1d3950d7b3ec2f7cd2ed5f49Timo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen ARRAY_TYPE(seq_range) uids;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen struct seq_range_iter uids_iter;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen unsigned int uids_iter_n;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen uint32_t last_existing_uid;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen bool warned;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen};
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainenstatic void *textcat = NULL;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainenstatic bool textcat_broken = FALSE;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainenstatic int textcat_refcount = 0;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainenstatic void rescan_clear_unseen_mailboxes(struct lucene_index *index,
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen struct rescan_context *rescan_ctx);
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainenstruct lucene_index *lucene_index_init(const char *path,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen struct mailbox_list *list,
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen const struct fts_lucene_settings *set)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen struct lucene_index *index;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen unsigned int len;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index = i_new(struct lucene_index, 1);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->path = i_strdup(path);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->list = list;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (set != NULL) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->set = *set;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->normalizer = !set->normalize ? NULL :
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen mailbox_list_get_namespace(list)->user->default_normalizer;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen } else {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen /* this is valid only for doveadm dump, so it doesn't matter */
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->set.default_language = "";
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#ifdef HAVE_LUCENE_STEMMER
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (set == NULL || !set->no_snowball) {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->default_analyzer =
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen _CLNEW snowball::SnowballAnalyzer(index->normalizer,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->set.default_language);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen } else
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#endif
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->default_analyzer = _CLNEW standard::StandardAnalyzer();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (index->normalizer != NULL) {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->normalizer_buf =
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen buffer_create_dynamic(default_pool, 1024);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_array_init(&index->analyzers, 32);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen textcat_refcount++;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen return index;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenvoid lucene_index_close(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen _CLDELETE(index->reader);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen _CLDELETE(index->writer);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen _CLDELETE(index->searcher);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenvoid lucene_index_deinit(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen struct lucene_analyzer *a;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen lucene_index_close(index);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen array_foreach_modifiable(&index->analyzers, a) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_free(a->lang);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen _CLDELETE(a->analyzer);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen array_free(&index->analyzers);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (--textcat_refcount == 0 && textcat != NULL) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#ifdef HAVE_LUCENE_TEXTCAT
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen textcat_Done(textcat);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#endif
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen textcat = NULL;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen _CLDELETE(index->default_analyzer);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (index->normalizer_buf != NULL)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen buffer_free(&index->normalizer_buf);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_free(index->path);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_free(index);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic void lucene_data_translate(struct lucene_index *index,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen wchar_t *data, unsigned int len)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const char *whitespace_chars = index->set.whitespace_chars;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen unsigned int i;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (*whitespace_chars == '\0')
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen for (i = 0; i < len; i++) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (strchr(whitespace_chars, data[i]) != NULL)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen data[i] = ' ';
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenvoid lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen wchar_t *dest, size_t destsize)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen ARRAY_TYPE(unichars) dest_arr;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen buffer_t buf = { 0, 0, { 0, 0, 0, 0, 0 } };
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_assert(sizeof(wchar_t) == sizeof(unichar_t));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen buffer_create_from_data(&buf, dest, sizeof(wchar_t) * destsize);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen array_create_from_buffer(&dest_arr, &buf, sizeof(wchar_t));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (uni_utf8_to_ucs4_n(src, srcsize, &dest_arr) < 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_unreached();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_assert(array_count(&dest_arr)+1 == destsize);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen dest[destsize-1] = 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic const wchar_t *
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainent_lucene_utf8_to_tchar(struct lucene_index *index,
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen const char *str, bool translate)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen ARRAY_TYPE(unichars) dest_arr;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen const unichar_t *chars;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen wchar_t *ret;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen unsigned int len;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_assert(sizeof(wchar_t) == sizeof(unichar_t));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen t_array_init(&dest_arr, strlen(str) + 1);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if (uni_utf8_to_ucs4(str, &dest_arr) < 0)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen i_unreached();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen (void)array_append_space(&dest_arr);
0a51697f82fbd45a511710479e99efd42dc18453Timo Sirainen
0a51697f82fbd45a511710479e99efd42dc18453Timo Sirainen chars = array_get_modifiable(&dest_arr, &len);
0a51697f82fbd45a511710479e99efd42dc18453Timo Sirainen ret = (wchar_t *)chars;
0a51697f82fbd45a511710479e99efd42dc18453Timo Sirainen lucene_data_translate(index, ret, len - 1);
0a51697f82fbd45a511710479e99efd42dc18453Timo Sirainen return ret;
0a51697f82fbd45a511710479e99efd42dc18453Timo Sirainen}
0a51697f82fbd45a511710479e99efd42dc18453Timo Sirainen
0a51697f82fbd45a511710479e99efd42dc18453Timo Sirainenvoid lucene_index_select_mailbox(struct lucene_index *index,
0a51697f82fbd45a511710479e99efd42dc18453Timo Sirainen const wchar_t guid[MAILBOX_GUID_HEX_LENGTH])
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen memcpy(index->mailbox_guid, guid,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen MAILBOX_GUID_HEX_LENGTH * sizeof(wchar_t));
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->mailbox_guid[MAILBOX_GUID_HEX_LENGTH] = '\0';
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenvoid lucene_index_unselect_mailbox(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen memset(index->mailbox_guid, 0, sizeof(index->mailbox_guid));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic void lucene_handle_error(struct lucene_index *index, CLuceneError &err,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const char *msg)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const char *what = err.what();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_error("lucene index %s: %s failed (#%d): %s",
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->path, msg, err.number(), what);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (index->list != NULL &&
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen (err.number() == CL_ERR_CorruptIndex ||
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen err.number() == CL_ERR_IO)) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen /* delete corrupted index. most IO errors are also about
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen missing files and other such corruption.. */
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (unlink_directory(index->path,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen UNLINK_DIRECTORY_FLAG_RMDIR) < 0 &&
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen errno != ENOENT)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_error("unlink_directory(%s) failed: %m", index->path);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen rescan_clear_unseen_mailboxes(index, NULL);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenstatic int lucene_index_open(struct lucene_index *index)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen{
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if (index->reader != NULL)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return 1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if (!IndexReader::indexExists(index->path))
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen try {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->reader = IndexReader::open(index->path);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen } catch (CLuceneError &err) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen lucene_handle_error(index, err, "IndexReader::open()");
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return -1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return 1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic int lucene_index_open_search(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen int ret;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (index->searcher != NULL)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return 1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if ((ret = lucene_index_open(index)) <= 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return ret;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->searcher = _CLNEW IndexSearcher(index->reader);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen return 1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic int
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenlucene_doc_get_uid(struct lucene_index *index, Document *doc, uint32_t *uid_r)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen Field *field = doc->getField(_T("uid"));
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen const TCHAR *uid = field == NULL ? NULL : field->stringValue();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (uid == NULL) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_error("lucene: Corrupted FTS index %s: No UID for document",
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->path);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return -1;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen uint32_t num = 0;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen while (*uid != 0) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen num = num*10 + (*uid - '0');
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen uid++;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen *uid_r = num;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return 0;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen}
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic uint32_t
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenlucene_doc_get_part(struct lucene_index *index, Document *doc)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen Field *field = doc->getField(_T("part"));
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen const TCHAR *part = field == NULL ? NULL : field->stringValue();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (part == NULL)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen return 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen uint32_t num = 0;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen while (*part != 0) {
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen num = num*10 + (*part - '0');
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen part++;
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return num;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenint lucene_index_get_last_uid(struct lucene_index *index, uint32_t *last_uid_r)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen int ret = 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen *last_uid_r = 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if ((ret = lucene_index_open_search(index)) <= 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return ret;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen Term mailbox_term(_T("box"), index->mailbox_guid);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen TermQuery query(&mailbox_term);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen uint32_t last_uid = 0;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen try {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen Hits *hits = index->searcher->search(&query);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen for (size_t i = 0; i < hits->length(); i++) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen uint32_t uid;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (lucene_doc_get_uid(index, &hits->doc(i),
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen &uid) < 0) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen ret = -1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen break;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (uid > last_uid)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen last_uid = uid;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen }
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen _CLDELETE(hits);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen } catch (CLuceneError &err) {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen lucene_handle_error(index, err, "last_uid search");
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen ret = -1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen *last_uid_r = last_uid;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen return ret;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainenint lucene_index_get_doc_count(struct lucene_index *index, uint32_t *count_r)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen int ret;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
531fa12126fc7abf63244a7ed4505896a8694206Timo Sirainen if (index->reader == NULL) {
531fa12126fc7abf63244a7ed4505896a8694206Timo Sirainen lucene_index_close(index);
531fa12126fc7abf63244a7ed4505896a8694206Timo Sirainen if ((ret = lucene_index_open(index)) < 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return -1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (ret == 0) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen *count_r = 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen *count_r = index->reader->numDocs();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic int lucene_settings_check(struct lucene_index *index)
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen struct fts_index_header hdr;
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen uint32_t set_checksum;
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen int ret = 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen set_checksum = fts_lucene_settings_checksum(&index->set);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen ret = fts_index_have_compatible_settings(index->list, set_checksum);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (ret != 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return ret;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen /* settings changed, rebuild index */
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (unlink_directory(index->path, UNLINK_DIRECTORY_FLAG_RMDIR) < 0) {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen i_error("unlink_directory(%s) failed: %m", index->path);
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen ret = -1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen } else {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen rescan_clear_unseen_mailboxes(index, NULL);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return ret;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenint lucene_index_build_init(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const char *lock_path;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen struct stat st;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen lucene_index_close(index);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen lock_path = t_strdup_printf("%s/write.lock", index->path);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (stat(lock_path, &st) == 0 &&
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen st.st_mtime < time(NULL) - LUCENE_LOCK_OVERRIDE_SECS) {
da5d50534cfca45d0aaaf0bdac17b287b4588809Timo Sirainen if (unlink(lock_path) < 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_error("unlink(%s) failed: %m", lock_path);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (lucene_settings_check(index) < 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return -1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen bool exists = IndexReader::indexExists(index->path);
2ca4cb08680aebb1474d762738cf436871f095fbTimo Sirainen try {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->writer = _CLNEW IndexWriter(index->path,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->default_analyzer,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen !exists);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen } catch (CLuceneError &err) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen lucene_handle_error(index, err, "IndexWriter()");
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return -1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen index->writer->setMaxFieldLength(MAX_TERMS_PER_DOCUMENT);
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen return 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
da5d50534cfca45d0aaaf0bdac17b287b4588809Timo Sirainen#ifdef HAVE_LUCENE_TEXTCAT
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic Analyzer *get_analyzer(struct lucene_index *index, const char *lang)
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen normalizer_func_t *normalizer = index->normalizer;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const struct lucene_analyzer *a;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen struct lucene_analyzer new_analyzer;
da5d50534cfca45d0aaaf0bdac17b287b4588809Timo Sirainen Analyzer *analyzer;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen array_foreach(&index->analyzers, a) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (strcmp(a->lang, lang) == 0)
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen return a->analyzer;
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen }
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen memset(&new_analyzer, 0, sizeof(new_analyzer));
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen new_analyzer.lang = i_strdup(lang);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen new_analyzer.analyzer =
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen _CLNEW snowball::SnowballAnalyzer(normalizer, lang);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen array_append_i(&index->analyzers.arr, &new_analyzer, 1);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen return new_analyzer.analyzer;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic void *textcat_init(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const char *textcat_dir = index->set.textcat_dir;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen unsigned int len;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (textcat_dir == NULL)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return NULL;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen /* textcat really wants the '/' suffix */
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen len = strlen(textcat_dir);
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen if (len > 0 && textcat_dir[len-1] != '/')
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen textcat_dir = t_strconcat(textcat_dir, "/", NULL);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return special_textcat_Init(index->set.textcat_conf, textcat_dir);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
50c4a9739b55370b1d3950d7b3ec2f7cd2ed5f49Timo Sirainen
50c4a9739b55370b1d3950d7b3ec2f7cd2ed5f49Timo Sirainenstatic Analyzer *
50c4a9739b55370b1d3950d7b3ec2f7cd2ed5f49Timo Sirainenguess_analyzer(struct lucene_index *index, const void *data, size_t size)
50c4a9739b55370b1d3950d7b3ec2f7cd2ed5f49Timo Sirainen{
50c4a9739b55370b1d3950d7b3ec2f7cd2ed5f49Timo Sirainen const char *lang;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (textcat_broken)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return NULL;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (textcat == NULL) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen textcat = textcat_init(index);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (textcat == NULL) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen textcat_broken = TRUE;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return NULL;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen }
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen /* try to guess the language */
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen lang = textcat_Classify(textcat, (const char *)data,
531fa12126fc7abf63244a7ed4505896a8694206Timo Sirainen I_MIN(size, 500));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const char *p = strchr(lang, ']');
56f45b3f3ae20e5c933701f4657dda5ef1916855Timo Sirainen if (lang[0] != '[' || p == NULL)
56f45b3f3ae20e5c933701f4657dda5ef1916855Timo Sirainen return NULL;
56f45b3f3ae20e5c933701f4657dda5ef1916855Timo Sirainen lang = t_strdup_until(lang+1, p);
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen if (strcmp(lang, index->set.default_language) == 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return index->default_analyzer;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen return get_analyzer(index, lang);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#else
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic Analyzer *
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenguess_analyzer(struct lucene_index *index ATTR_UNUSED,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const void *data ATTR_UNUSED, size_t size ATTR_UNUSED)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen return NULL;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen#endif
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenstatic int lucene_index_build_flush(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen int ret = 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (index->doc == NULL)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen return 0;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen try {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->writer->addDocument(index->doc,
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->cur_analyzer != NULL ?
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->cur_analyzer :
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->default_analyzer);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen } catch (CLuceneError &err) {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen lucene_handle_error(index, err, "IndexWriter::addDocument()");
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen ret = -1;
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen }
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen _CLDELETE(index->doc);
531fa12126fc7abf63244a7ed4505896a8694206Timo Sirainen index->doc = NULL;
531fa12126fc7abf63244a7ed4505896a8694206Timo Sirainen index->cur_analyzer = NULL;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen return ret;
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen}
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainenint lucene_index_build_more(struct lucene_index *index, uint32_t uid,
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen uint32_t part_idx, const unsigned char *data,
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen size_t size, const char *hdr_name)
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen{
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen wchar_t id[MAX_INT_STRLEN];
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen size_t namesize, datasize;
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen if (uid != index->prev_uid || part_idx != index->prev_part_idx) {
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen if (lucene_index_build_flush(index) < 0)
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen return -1;
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen index->prev_uid = uid;
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen index->prev_part_idx = part_idx;
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen index->doc = _CLNEW Document();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen swprintf(id, N_ELEMENTS(id), L"%u", uid);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->doc->add(*_CLNEW Field(_T("uid"), id, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (part_idx != 0) {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen swprintf(id, N_ELEMENTS(id), L"%u", part_idx);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->doc->add(*_CLNEW Field(_T("part"), id, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen }
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen }
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if (index->normalizer_buf != NULL) {
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen buffer_set_used_size(index->normalizer_buf, 0);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->normalizer(data, size, index->normalizer_buf);
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen data = (const unsigned char *)index->normalizer_buf->data;
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen size = index->normalizer_buf->used;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen }
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen datasize = uni_utf8_strlen_n(data, size) + 1;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen wchar_t *dest, *dest_free = NULL;
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen if (datasize < 4096)
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen dest = t_new(wchar_t, datasize);
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen else
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen dest = dest_free = i_new(wchar_t, datasize);
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen lucene_utf8_n_to_tchar(data, size, dest, datasize);
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen lucene_data_translate(index, dest, datasize);
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen if (hdr_name != NULL) {
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen /* hdr_name should be ASCII, but don't break in case it isn't */
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen hdr_name = t_str_lcase(hdr_name);
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen namesize = uni_utf8_strlen(hdr_name) + 1;
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen wchar_t wname[namesize];
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen lucene_utf8_n_to_tchar((const unsigned char *)hdr_name,
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen strlen(hdr_name), wname, namesize);
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->doc->add(*_CLNEW Field(_T("hdr"), wname, Field::STORE_NO | Field::INDEX_TOKENIZED));
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->doc->add(*_CLNEW Field(_T("hdr"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if (fts_header_want_indexed(hdr_name))
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->doc->add(*_CLNEW Field(wname, dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen } else if (size > 0) {
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen if (index->cur_analyzer == NULL)
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->cur_analyzer = guess_analyzer(index, data, size);
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->doc->add(*_CLNEW Field(_T("body"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen }
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen i_free(dest_free);
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen return 0;
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen}
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainenint lucene_index_build_deinit(struct lucene_index *index)
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen{
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen int ret = 0;
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen
44ff75ca53188056ff5a3e50428e3f2078800b3cTimo Sirainen if (index->prev_uid == 0) {
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen /* no changes. */
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen return 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->prev_uid = 0;
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->prev_part_idx = 0;
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen if (index->writer == NULL) {
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen lucene_index_close(index);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return -1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen if (lucene_index_build_flush(index) < 0)
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen ret = -1;
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen try {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->writer->close();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen } catch (CLuceneError &err) {
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen lucene_handle_error(index, err, "IndexWriter::close()");
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen ret = -1;
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen }
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen lucene_index_close(index);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen return ret;
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen}
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenstatic int
4bbee99b3aef449a9a2a11a5b5cf1ca486915c49Timo Sirainenwcharguid_to_guid(guid_128_t dest, const wchar_t *src)
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen{
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen buffer_t buf = { 0, 0, { 0, 0, 0, 0, 0 } };
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen char src_chars[GUID_128_SIZE*2 + 1];
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen unsigned int i;
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen for (i = 0; i < sizeof(src_chars)-1; i++) {
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen if ((src[i] >= '0' && src[i] <= '9') ||
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen (src[i] >= 'a' && src[i] <= 'f'))
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen src_chars[i] = src[i];
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen else
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen return -1;
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen }
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen if (src[i] != '\0')
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen return -1;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen src_chars[i] = '\0';
4bbee99b3aef449a9a2a11a5b5cf1ca486915c49Timo Sirainen
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen buffer_create_from_data(&buf, dest, GUID_128_SIZE);
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen return hex_to_binary(src_chars, &buf);
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen}
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic int
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenrescan_get_uids(struct mailbox *box, ARRAY_TYPE(seq_range) *uids)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen struct mailbox_status status;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (mailbox_get_status(box, STATUS_MESSAGES, &status) < 0)
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen return -1;
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen if (status.messages > 0) T_BEGIN {
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen ARRAY_TYPE(seq_range) seqs;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen t_array_init(&seqs, 2);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen seq_range_array_add_range(&seqs, 1, status.messages);
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen mailbox_get_uid_range(box, &seqs, uids);
a40d26f83af808a0ea1e212c001d682a96d870b0Timo Sirainen } T_END;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen return 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen}
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic int rescan_finish(struct rescan_context *ctx)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen{
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen int ret;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen ret = fts_index_set_last_uid(ctx->box, ctx->last_existing_uid);
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen mailbox_free(&ctx->box);
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen return ret;
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen}
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenstatic int
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainenfts_lucene_get_mailbox_guid(struct lucene_index *index, Document *doc,
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen guid_128_t guid_r)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen{
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen Field *field = doc->getField(_T("box"));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const TCHAR *box_guid = field == NULL ? NULL : field->stringValue();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (box_guid == NULL) {
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen i_error("lucene: Corrupted FTS index %s: No mailbox for document",
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen index->path);
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen return -1;
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen }
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen if (wcharguid_to_guid(guid_r, box_guid) < 0) {
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen i_error("lucene: Corrupted FTS index %s: "
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen "box field not in expected format", index->path);
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen return -1;
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen }
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen return 0;
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen}
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainenstatic int
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainenrescan_open_mailbox(struct rescan_context *ctx, Document *doc)
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen{
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen guid_128_t guid, *guidp;
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen int ret;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (fts_lucene_get_mailbox_guid(ctx->index, doc, guid) < 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return 0;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (memcmp(guid, ctx->box_guid, sizeof(guid)) == 0) {
531fa12126fc7abf63244a7ed4505896a8694206Timo Sirainen /* same as last one */
531fa12126fc7abf63244a7ed4505896a8694206Timo Sirainen return ctx->box_ret;
531fa12126fc7abf63244a7ed4505896a8694206Timo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen memcpy(ctx->box_guid, guid, sizeof(ctx->box_guid));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen guidp = p_new(ctx->pool, guid_128_t, 1);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen memcpy(guidp, guid, sizeof(*guidp));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen hash_table_insert(ctx->seen_mailbox_guids, guidp, guidp);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (ctx->box != NULL)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen rescan_finish(ctx);
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen ctx->box = mailbox_alloc_guid(ctx->index->list, guid,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen (enum mailbox_flags)0);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if (mailbox_open(ctx->box) < 0) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen enum mail_error error;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const char *errstr;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen errstr = mailbox_get_last_error(ctx->box, &error);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if (error == MAIL_ERROR_NOTFOUND)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen ret = 0;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen else {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen i_error("lucene: Couldn't open mailbox %s: %s",
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen mailbox_get_vname(ctx->box), errstr);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen ret = -1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen }
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen mailbox_free(&ctx->box);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen ctx->box_ret = ret;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen return ret;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen }
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if (mailbox_sync(ctx->box, (enum mailbox_sync_flags)0) < 0) {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen i_error("lucene: Failed to sync mailbox %s: %s",
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen mailbox_get_vname(ctx->box),
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen mailbox_get_last_error(ctx->box, NULL));
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen mailbox_free(&ctx->box);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen ctx->box_ret = -1;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return -1;
a40d26f83af808a0ea1e212c001d682a96d870b0Timo Sirainen }
a40d26f83af808a0ea1e212c001d682a96d870b0Timo Sirainen
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen array_clear(&ctx->uids);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen rescan_get_uids(ctx->box, &ctx->uids);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen
a40d26f83af808a0ea1e212c001d682a96d870b0Timo Sirainen ctx->warned = FALSE;
a40d26f83af808a0ea1e212c001d682a96d870b0Timo Sirainen ctx->last_existing_uid = 0;
ctx->uids_iter_n = 0;
seq_range_array_iter_init(&ctx->uids_iter, &ctx->uids);
ctx->box_ret = 1;
return 1;
}
/* Compare the document's UID against the next expected UID of the open
   mailbox.

   Returns 1 when they match (the iterator advances and the UID is recorded
   as the last existing one). Returns 0 in every other case - unreadable
   UID, no UIDs left in the mailbox, or a mismatch in either direction -
   which the caller treats as "delete this document". */
static int
rescan_next(struct rescan_context *ctx, Document *doc)
{
	uint32_t lucene_uid, idx_uid;

	if (lucene_doc_get_uid(ctx->index, doc, &lucene_uid) < 0)
		return 0;

	if (!seq_range_array_iter_nth(&ctx->uids_iter, ctx->uids_iter_n,
				      &idx_uid)) {
		/* the rest of the messages have been expunged from index */
		return 0;
	}

	if (idx_uid == lucene_uid) {
		ctx->uids_iter_n++;
		ctx->last_existing_uid = idx_uid;
		return 1;
	}

	if (idx_uid < lucene_uid && !ctx->warned) {
		/* lucene is missing an UID from the middle. delete
		   the rest of the messages from this mailbox and
		   reindex. warn only once per mailbox. */
		i_warning("lucene: Mailbox %s "
			  "missing UIDs in the middle",
			  mailbox_get_vname(ctx->box));
		ctx->warned = TRUE;
	}
	/* otherwise (idx_uid > lucene_uid) the UID has been expunged from
	   index. delete from lucene as well. */
	return 0;
}
/* Write a cleared FTS index header (only the settings checksum set) to
   mailboxes that have no records in the lucene index.

   With rescan_ctx == NULL every mailbox that can be opened is reset;
   otherwise only those whose GUID isn't in rescan_ctx->seen_mailbox_guids.
   Mailboxes that fail to open or to return their GUID are skipped. */
static void rescan_clear_unseen_mailboxes(struct lucene_index *index,
					  struct rescan_context *rescan_ctx)
{
	const enum mailbox_list_iter_flags iter_flags =
		(enum mailbox_list_iter_flags)
		(MAILBOX_LIST_ITER_NO_AUTO_BOXES |
		 MAILBOX_LIST_ITER_RETURN_NO_FLAGS);
	struct mailbox_list_iterate_context *iter;
	const struct mailbox_info *info;
	struct mailbox_metadata metadata;
	struct fts_index_header hdr;

	memset(&hdr, 0, sizeof(hdr));
	hdr.settings_checksum = fts_lucene_settings_checksum(&index->set);

	iter = mailbox_list_iter_init(index->list, "*", iter_flags);
	for (;;) {
		info = mailbox_list_iter_next(iter);
		if (info == NULL)
			break;

		struct mailbox *box = mailbox_alloc(index->list, info->vname,
						    (enum mailbox_flags)0);
		const bool unseen =
			mailbox_open(box) == 0 &&
			mailbox_get_metadata(box, MAILBOX_METADATA_GUID,
					     &metadata) == 0 &&
			(rescan_ctx == NULL ||
			 hash_table_lookup(rescan_ctx->seen_mailbox_guids,
					   metadata.guid) == NULL);
		if (unseen) {
			/* this mailbox had no records in lucene index.
			   make sure its last indexed uid is 0 */
			(void)fts_index_set_header(box, &hdr);
		}
		mailbox_free(&box);
	}
	(void)mailbox_list_iter_deinit(&iter);
}
int lucene_index_rescan(struct lucene_index *index)
{
static const TCHAR *sort_fields[] = { _T("box"), _T("uid"), NULL };
struct rescan_context ctx;
guid_128_t guid;
bool failed = false;
int ret;
i_assert(index->list != NULL);
if ((ret = lucene_index_open_search(index)) < 0)
return ret;
Term term(_T("box"), _T("*"));
WildcardQuery query(&term);
Sort sort(sort_fields);
memset(&ctx, 0, sizeof(ctx));
ctx.index = index;
ctx.pool = pool_alloconly_create("guids", 1024);
hash_table_create(&ctx.seen_mailbox_guids, ctx.pool, 0,
guid_128_hash, guid_128_cmp);
i_array_init(&ctx.uids, 128);
if (ret > 0) try {
Hits *hits = index->searcher->search(&query, &sort);
for (size_t i = 0; i < hits->length(); i++) {
ret = rescan_open_mailbox(&ctx, &hits->doc(i));
if (ret > 0)
ret = rescan_next(&ctx, &hits->doc(i));
if (ret < 0)
failed = true;
else if (ret == 0)
index->reader->deleteDocument(hits->id(i));
}
_CLDELETE(hits);
index->reader->close();
lucene_index_close(index);
} catch (CLuceneError &err) {
lucene_handle_error(index, err, "rescan search");
failed = true;
}
if (ctx.box != NULL)
rescan_finish(&ctx);
array_free(&ctx.uids);
rescan_clear_unseen_mailboxes(index, &ctx);
hash_table_destroy(&ctx.seen_mailbox_guids);
pool_unref(&ctx.pool);
return failed ? -1 : 0;
}
/* Write the hex form of a binary GUID into wguid_hex as a NUL-terminated
   wide-character string of MAILBOX_GUID_HEX_LENGTH characters. */
static void guid128_to_wguid(const guid_128_t guid,
			     wchar_t wguid_hex[MAILBOX_GUID_HEX_LENGTH + 1])
{
	buffer_t buf = { 0, 0, { 0, 0, 0, 0, 0 } };
	unsigned char guid_hex[MAILBOX_GUID_HEX_LENGTH];
	unsigned int pos = 0;

	buffer_create_from_data(&buf, guid_hex, MAILBOX_GUID_HEX_LENGTH);
	binary_to_hex_append(&buf, guid, GUID_128_SIZE);

	/* widen the hex characters one by one */
	while (pos < MAILBOX_GUID_HEX_LENGTH) {
		wguid_hex[pos] = guid_hex[pos];
		pos++;
	}
	wguid_hex[pos] = '\0';
}
/* Add one optional ("SHOULD") "uid" term to `query` for every UID in the
   expunge record. Returns false without touching the query when the record
   has more than FTS_LUCENE_MAX_SEARCH_TERMS UIDs, true otherwise. */
static bool
lucene_index_add_uid_filter(BooleanQuery *query,
			    const struct fts_expunge_log_read_record *rec)
{
	struct seq_range_iter iter;
	wchar_t wuid[MAX_INT_STRLEN];
	unsigned int n;
	uint32_t uid;

	/* RangeQuery and WildcardQuery work by enumerating through all terms
	   that match them, and then adding TermQueries for them. So we can
	   simply do the same directly, and if it looks like there are too
	   many terms just go through everything. */
	if (seq_range_count(&rec->uids) > FTS_LUCENE_MAX_SEARCH_TERMS)
		return false;

	seq_range_array_iter_init(&iter, &rec->uids);
	for (n = 0; seq_range_array_iter_nth(&iter, n, &uid); n++) {
		swprintf(wuid, N_ELEMENTS(wuid), L"%u", uid);

		Term *uid_term = _CLNEW Term(_T("uid"), wuid);
		query->add(_CLNEW TermQuery(uid_term), true,
			   BooleanClause::SHOULD);
		_CLDECDELETE(uid_term);
	}
	return true;
}
/* Delete from lucene the documents described by one expunge log record.

   The search always requires the record's mailbox GUID in the "box" field.
   The UIDs are added as a filter only when
   lucene_index_add_uid_filter() accepts them; otherwise all documents of
   the mailbox are returned and checked here. A hit is deleted when its UID
   is listed in the record or its UID can't be read.

   Returns 0 on success (also when lucene_index_open_search() returned 0),
   -1 on error. */
static int
lucene_index_expunge_record(struct lucene_index *index,
			    const struct fts_expunge_log_read_record *rec)
{
	int ret;

	if ((ret = lucene_index_open_search(index)) <= 0)
		return ret;

	BooleanQuery query;
	BooleanQuery uids_query;

	if (lucene_index_add_uid_filter(&uids_query, rec))
		query.add(&uids_query, BooleanClause::MUST);

	wchar_t wguid[MAILBOX_GUID_HEX_LENGTH + 1];
	guid128_to_wguid(rec->mailbox_guid, wguid);
	Term term(_T("box"), wguid);
	TermQuery mailbox_query(&term);
	query.add(&mailbox_query, BooleanClause::MUST);

	/* declared outside the try block so that the error path can free it */
	Hits *hits = NULL;
	try {
		hits = index->searcher->search(&query);

		for (size_t i = 0; i < hits->length(); i++) {
			uint32_t uid;

			if (lucene_doc_get_uid(index, &hits->doc(i),
					       &uid) < 0 ||
			    seq_range_exists(&rec->uids, uid))
				index->reader->deleteDocument(hits->id(i));
		}
		_CLDELETE(hits);
	} catch (CLuceneError &err) {
		_CLDELETE(hits);
		lucene_handle_error(index, err, "expunge search");
		ret = -1;
	}
	return ret < 0 ? -1 : 0;
}
/* Apply every record of the expunge log to the lucene index, stopping at the
   first record that fails. The reader and the index handles are closed
   afterwards in any case.

   Returns -1 if expunging, closing or ending the log read failed; otherwise
   the return value of fts_expunge_log_read_end(). */
int lucene_index_expunge_from_log(struct lucene_index *index,
				  struct fts_expunge_log *log)
{
	struct fts_expunge_log_read_ctx *ctx;
	const struct fts_expunge_log_read_record *rec;
	int ret = 0, ret2;

	ctx = fts_expunge_log_read_begin(log);
	/* ret != 0 ends the loop before another record is read */
	while (ret == 0 && (rec = fts_expunge_log_read_next(ctx)) != NULL) {
		if (lucene_index_expunge_record(index, rec) < 0)
			ret = -1;
	}

	try {
		if (index->reader != NULL)
			index->reader->close();
		lucene_index_close(index);
	} catch (CLuceneError &err) {
		lucene_handle_error(index, err, "expunge delete");
		ret = -1;
	}

	ret2 = fts_expunge_log_read_end(&ctx);
	return (ret < 0 || ret2 < 0) ? -1 : ret2;
}
/* Optimize an existing index. Returns 0 when there is no index at
   index->path or the optimization succeeded, -1 if CLucene threw an error.
   An existing lock on the index is removed first. */
int lucene_index_optimize(struct lucene_index *index)
{
	if (!IndexReader::indexExists(index->path))
		return 0;

	if (IndexReader::isLocked(index->path))
		IndexReader::unlock(index->path);

	int ret = 0;
	IndexWriter *writer = NULL;
	try {
		writer = _CLNEW IndexWriter(index->path,
					    index->default_analyzer, false);
		writer->optimize();
	} catch (CLuceneError &err) {
		lucene_handle_error(index, err, "IndexWriter::optimize()");
		ret = -1;
	}

	/* writer is still NULL if the constructor threw */
	if (writer != NULL)
		_CLDELETE(writer);
	return ret;
}
// Mostly copy&pasted from CLucene's QueryParser
//
// Builds a query for searching queryText in the field _field. The text is
// tokenized with the given analyzer and the result depends on the tokens:
//  - no tokens: NULL
//  - one token: FuzzyQuery if fuzzy is set, otherwise TermQuery
//  - several tokens, some sharing a position, one position in total:
//    BooleanQuery with every token as an optional (SHOULD) TermQuery
//  - several tokens, some sharing a position, more than one position:
//    MultiPhraseQuery
//  - several tokens, each at its own position: PhraseQuery
// The fuzzy flag has an effect only in the single-token case.
static Query* getFieldQuery(Analyzer *analyzer, const TCHAR* _field, const TCHAR* queryText, bool fuzzy) {
// Use the analyzer to get all the tokens, and then build a TermQuery,
// PhraseQuery, or nothing based on the term count
StringReader reader(queryText);
TokenStream* source = analyzer->tokenStream(_field, &reader);
// collected tokens
CLVector<CL_NS(analysis)::Token*, Deletor::Object<CL_NS(analysis)::Token> > v;
CL_NS(analysis)::Token* t = NULL;
// sum of the position increments of all tokens
int32_t positionCount = 0;
// set when any token has a position increment of 0
bool severalTokensAtSamePosition = false;
while (true) {
t = _CLNEW Token();
try {
Token* _t = source->next(t);
// NOTE(review): the loop exit below relies on _CLDELETE() resetting
// t to NULL when the stream is exhausted - confirm against the macro
if (_t == NULL) _CLDELETE(t);
}_CLCATCH_ERR(CL_ERR_IO, _CLLDELETE(source);_CLLDELETE(t);,{
t = NULL;
});
if (t == NULL)
break;
v.push_back(t);
if (t->getPositionIncrement() != 0)
positionCount += t->getPositionIncrement();
else
severalTokensAtSamePosition = true;
}
try {
source->close();
}
_CLCATCH_ERR_CLEANUP(CL_ERR_IO, {_CLLDELETE(source);_CLLDELETE(t);} ); /* cleanup */
_CLLDELETE(source);
if (v.size() == 0)
return NULL;
else if (v.size() == 1) {
// single token: the only case where the fuzzy flag matters
Term* tm = _CLNEW Term(_field, v.at(0)->termBuffer());
Query* ret;
if (fuzzy)
ret = _CLNEW FuzzyQuery( tm );
else
ret = _CLNEW TermQuery( tm );
_CLDECDELETE(tm);
return ret;
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
// no phrase query:
BooleanQuery* q = _CLNEW BooleanQuery(true);
for(size_t i=0; i<v.size(); i++ ){
Term* tm = _CLNEW Term(_field, v.at(i)->termBuffer());
q->add(_CLNEW TermQuery(tm), true, BooleanClause::SHOULD);
_CLDECDELETE(tm);
}
return q;
}else {
MultiPhraseQuery* mpq = _CLNEW MultiPhraseQuery();
// terms gathered for the position currently being built
CLArrayList<Term*> multiTerms;
int32_t position = -1;
for (size_t i = 0; i < v.size(); i++) {
t = v.at(i);
// a positive increment starts a new position: add the terms
// gathered so far before moving on
if (t->getPositionIncrement() > 0 && multiTerms.size() > 0) {
ValueArray<Term*> termsArray(multiTerms.size());
multiTerms.toArray(termsArray.values);
mpq->add(&termsArray,position);
multiTerms.clear();
}
position += t->getPositionIncrement();
multiTerms.push_back(_CLNEW Term(_field, t->termBuffer()));
}
// add the terms of the last position
ValueArray<Term*> termsArray(multiTerms.size());
multiTerms.toArray(termsArray.values);
mpq->add(&termsArray,position);
return mpq;
}
}else {
// every token has its own position: plain phrase query
PhraseQuery* pq = _CLNEW PhraseQuery();
int32_t position = -1;
for (size_t i = 0; i < v.size(); i++) {
t = v.at(i);
Term* tm = _CLNEW Term(_field, t->termBuffer());
position += t->getPositionIncrement();
pq->add(tm,position);
_CLDECDELETE(tm);
}
return pq;
}
}
}
/* Build a query matching `str` in the field `key`.

   If the index has a normalizer buffer, the string is normalized into it
   first and the normalized text is used from then on. The analyzer returned
   by guess_analyzer() is used, falling back to the index's default analyzer
   when it returns NULL. Returns whatever getFieldQuery() returns, which may
   be NULL. */
static Query *
lucene_get_query_str(struct lucene_index *index,
		     const TCHAR *key, const char *str, bool fuzzy)
{
	if (index->normalizer_buf != NULL) {
		/* reuse the shared buffer: reset it, normalize into it and
		   NUL-terminate so it can be used as a C string */
		buffer_set_used_size(index->normalizer_buf, 0);
		index->normalizer(str, strlen(str), index->normalizer_buf);
		buffer_append_c(index->normalizer_buf, '\0');
		str = (const char *)index->normalizer_buf->data;
	}

	const TCHAR *wide_text = t_lucene_utf8_to_tchar(index, str, TRUE);

	Analyzer *chosen = guess_analyzer(index, str, strlen(str));
	if (chosen == NULL) {
		chosen = index->default_analyzer;
	}
	return getFieldQuery(chosen, key, wide_text, fuzzy);
}
/* Build a query for field `key` from a search argument, using the
   argument's string value and its fuzzy flag. */
static Query *
lucene_get_query(struct lucene_index *index,
		 const TCHAR *key, const struct mail_search_arg *arg)
{
	const char *text = arg->value.str;
	bool want_fuzzy = arg->fuzzy;

	return lucene_get_query_str(index, key, text, want_fuzzy);
}
/* Try to translate `arg` into a query whose result can be trusted as-is and
   append it to `queries`.

   index:    index the query is built for
   queries:  array that receives the new lucene_query entry
   arg:      search argument to translate
   and_args: whether the arguments are being ANDed together; this selects
             the occur type (MUST / MUST_NOT when true, SHOULD when false)

   Returns true if an entry was appended. Returns false if the argument
   can't be handled here: a negated argument outside an AND list, an
   unsupported argument type, a header that isn't indexed or has an empty
   search value, or a search string that yields no query. */
static bool
lucene_add_definite_query(struct lucene_index *index,
			  ARRAY_TYPE(lucene_query) &queries,
			  struct mail_search_arg *arg, bool and_args)
{
	Query *q;

	if (arg->match_not && !and_args) {
		/* FIXME: we could handle this by doing multiple queries.. */
		return false;
	}

	switch (arg->type) {
	case SEARCH_TEXT: {
		/* text search = header OR body */
		Query *q1 = lucene_get_query(index, _T("hdr"), arg);
		Query *q2 = lucene_get_query(index, _T("body"), arg);

		if (q1 == NULL && q2 == NULL) {
			q = NULL;
		} else {
			/* allocate the wrapper only when there is something
			   to put into it, so that nothing is leaked when
			   neither field produced a query */
			BooleanQuery *bq = _CLNEW BooleanQuery();
			if (q1 != NULL)
				bq->add(q1, true, BooleanClause::SHOULD);
			if (q2 != NULL)
				bq->add(q2, true, BooleanClause::SHOULD);
			q = bq;
		}
		break;
	}
	case SEARCH_BODY:
		q = lucene_get_query(index, _T("body"), arg);
		break;
	case SEARCH_HEADER:
	case SEARCH_HEADER_ADDRESS:
	case SEARCH_HEADER_COMPRESS_LWSP:
		if (!fts_header_want_indexed(arg->hdr_field_name) ||
		    *arg->value.str == '\0')
			return false;

		/* the lowercased header name is used as the field name */
		q = lucene_get_query(index,
			t_lucene_utf8_to_tchar(index, t_str_lcase(arg->hdr_field_name), FALSE),
			arg);
		break;
	default:
		return false;
	}

	if (q == NULL) {
		/* couldn't handle this search after all (e.g. trying to search
		   a stop word) */
		return false;
	}

	struct lucene_query *lq = array_append_space(&queries);
	lq->query = q;
	if (!and_args)
		lq->occur = BooleanClause::SHOULD;
	else if (!arg->match_not)
		lq->occur = BooleanClause::MUST;
	else
		lq->occur = BooleanClause::MUST_NOT;
	return true;
}
/* Try to translate `arg` into a "maybe" query and append it to `queries`.
   Such a query searches the generic "hdr" field, so it can only be used to
   filter out messages that have no chance of matching.

   index:    index the query is built for
   queries:  array that receives the new lucene_query entry
   arg:      search argument to translate
   and_args: whether the arguments are being ANDed together; selects MUST
             (true) or SHOULD (false) as the occur type

   Returns true if an entry was appended. Returns false for negated
   arguments, non-header argument types, headers for which
   fts_header_want_indexed() returns true (unless the search value is
   empty), and search strings that yield no query. */
static bool
lucene_add_maybe_query(struct lucene_index *index,
		       ARRAY_TYPE(lucene_query) &queries,
		       struct mail_search_arg *arg, bool and_args)
{
	Query *q = NULL;

	if (arg->match_not) {
		/* FIXME: we could handle this by doing multiple queries.. */
		return false;
	}

	switch (arg->type) {
	case SEARCH_HEADER:
	case SEARCH_HEADER_ADDRESS:
	case SEARCH_HEADER_COMPRESS_LWSP:
		if (*arg->value.str == '\0') {
			/* checking potential existence of the header name */
			q = lucene_get_query_str(index, _T("hdr"),
				t_str_lcase(arg->hdr_field_name), FALSE);
			break;
		}

		if (fts_header_want_indexed(arg->hdr_field_name))
			return false;

		/* we can check if the search key exists in some header and
		   filter out the messages that have no chance of matching */
		q = lucene_get_query(index, _T("hdr"), arg);
		break;
	default:
		return false;
	}

	if (q == NULL) {
		/* couldn't handle this search after all (e.g. trying to search
		   a stop word) */
		return false;
	}

	struct lucene_query *lq = array_append_space(&queries);
	lq->query = q;
	/* negated arguments were rejected above, so the occur type can
	   never be MUST_NOT here */
	if (and_args)
		lq->occur = BooleanClause::MUST;
	else
		lq->occur = BooleanClause::SHOULD;
	return true;
}
/* Returns true if at least one entry in `queries` has an occur type other
   than MUST_NOT. Returns false for an empty array. */
static bool queries_have_non_must_nots(ARRAY_TYPE(lucene_query) &queries)
{
	const struct lucene_query *entry;

	array_foreach(&queries, entry) {
		if (entry->occur == BooleanClause::MUST_NOT)
			continue;
		return true;
	}
	return false;
}
/* Wrap all entries of `queries` into a new BooleanQuery and add that as a
   single clause to `query`.

   If any entry is something other than MUST_NOT, the entries keep their own
   occur types and the wrapper is added as MUST. If every entry is MUST_NOT,
   the entries are added to the wrapper as SHOULD and the wrapper itself is
   added as MUST_NOT. */
static void search_query_add(BooleanQuery &query,
			     ARRAY_TYPE(lucene_query) &queries)
{
	BooleanQuery *wrapper = _CLNEW BooleanQuery();
	const struct lucene_query *entry;

	if (!queries_have_non_must_nots(queries)) {
		/* only negations: negate the whole group instead */
		array_foreach(&queries, entry) {
			wrapper->add(entry->query, true, BooleanClause::SHOULD);
		}
		query.add(wrapper, true, BooleanClause::MUST_NOT);
		return;
	}

	array_foreach(&queries, entry) {
		wrapper->add(entry->query, true, entry->occur);
	}
	query.add(wrapper, true, BooleanClause::MUST);
}
/* Run `queries` against the index, restricted to documents whose "box"
   field equals index->mailbox_guid.

   index:   index to search; its searcher is used directly
   queries: queries to combine (see search_query_add())
   result:  if non-NULL, receives a score entry for each UID that was not
            already present in uids_r, and its scores_sorted flag is
            cleared when a later UID is smaller than the previous one
   uids_r:  receives the UID of every hit

   Returns 0 on success, -1 if a hit's UID couldn't be read or CLucene
   raised an error. */
static int
lucene_index_search(struct lucene_index *index,
		    ARRAY_TYPE(lucene_query) &queries,
		    struct fts_result *result, ARRAY_TYPE(seq_range) *uids_r)
{
	struct fts_score_map *score;
	/* declared outside the try block so that the hits can be released
	   when an exception is thrown part way through the loop */
	Hits *hits = NULL;
	int ret = 0;

	BooleanQuery query;
	search_query_add(query, queries);

	Term mailbox_term(_T("box"), index->mailbox_guid);
	TermQuery mailbox_query(&mailbox_term);
	query.add(&mailbox_query, BooleanClause::MUST);

	try {
		hits = index->searcher->search(&query);

		uint32_t last_uid = 0;
		if (result != NULL)
			result->scores_sorted = true;

		for (size_t i = 0; i < hits->length(); i++) {
			uint32_t uid;

			if (lucene_doc_get_uid(index, &hits->doc(i),
					       &uid) < 0) {
				ret = -1;
				break;
			}

			if (seq_range_array_add(uids_r, uid)) {
				/* duplicate result */
			} else if (result != NULL) {
				if (uid < last_uid)
					result->scores_sorted = false;
				last_uid = uid;

				score = array_append_space(&result->scores);
				score->uid = uid;
				score->score = hits->score(i);
			}
		}
	} catch (CLuceneError &err) {
		lucene_handle_error(index, err, "search");
		ret = -1;
	}

	/* single release point for both the normal and the error path */
	if (hits != NULL)
		_CLDELETE(hits);
	return ret;
}
/* Look up the search arguments in the index of a single mailbox.

   Every argument that could be turned into a query gets match_always set.
   If any "definite" query could be built, only those are run and their hits
   go to result->definite_uids (with scores in `result`). Otherwise "maybe"
   queries are tried and their hits go to result->maybe_uids.

   Returns 0 on success, -1 if the index couldn't be opened for searching
   (lucene_index_open_search() returned <= 0) or a search failed. */
int lucene_index_lookup(struct lucene_index *index,
			struct mail_search_arg *args, bool and_args,
			struct fts_result *result)
{
	struct mail_search_arg *cur;

	if (lucene_index_open_search(index) <= 0)
		return -1;

	ARRAY_TYPE(lucene_query) def_queries;
	t_array_init(&def_queries, 16);
	bool have_definites = false;

	for (cur = args; cur != NULL; cur = cur->next) {
		if (!lucene_add_definite_query(index, def_queries, cur, and_args))
			continue;
		cur->match_always = true;
		have_definites = true;
	}

	if (have_definites) {
		/* FIXME: mixing up definite + maybe queries is broken. if the
		   definite query matched, it'll just assume that the maybe
		   queries matched as well */
		if (lucene_index_search(index, def_queries, result,
					&result->definite_uids) < 0)
			return -1;
		return 0;
	}

	ARRAY_TYPE(lucene_query) maybe_queries;
	t_array_init(&maybe_queries, 16);
	bool have_maybies = false;

	for (cur = args; cur != NULL; cur = cur->next) {
		if (!lucene_add_maybe_query(index, maybe_queries, cur, and_args))
			continue;
		cur->match_always = true;
		have_maybies = true;
	}

	if (!have_maybies)
		return 0;

	/* no result is passed here, so no scores are recorded for maybe hits */
	if (lucene_index_search(index, maybe_queries, NULL,
				&result->maybe_uids) < 0)
		return -1;
	return 0;
}
static int
lucene_index_search_multi(struct lucene_index *index,
HASH_TABLE_TYPE(wguid_result) guids,
ARRAY_TYPE(lucene_query) &queries,
struct fts_multi_result *result)
{
struct fts_score_map *score;
int ret = 0;
BooleanQuery query;
search_query_add(query, queries);
BooleanQuery mailbox_query;
struct hash_iterate_context *iter;
void *key, *value;
iter = hash_table_iterate_init(guids);
while (hash_table_iterate(iter, guids, &key, &value)) {
Term *term = _CLNEW Term(_T("box"), (wchar_t *)key);
TermQuery *q = _CLNEW TermQuery(term);
mailbox_query.add(q, true, BooleanClause::SHOULD);
}
hash_table_iterate_deinit(&iter);
query.add(&mailbox_query, BooleanClause::MUST);
try {
Hits *hits = index->searcher->search(&query);
for (size_t i = 0; i < hits->length(); i++) {
uint32_t uid;
Field *field = hits->doc(i).getField(_T("box"));
const TCHAR *box_guid = field == NULL ? NULL : field->stringValue();
if (box_guid == NULL) {
i_error("lucene: Corrupted FTS index %s: No mailbox for document",
index->path);
ret = -1;
break;
}
struct fts_result *br =
hash_table_lookup(guids, box_guid);
if (br == NULL) {
i_warning("lucene: Returned unexpected mailbox with GUID %ls", box_guid);
continue;
}
if (lucene_doc_get_uid(index, &hits->doc(i),
&uid) < 0) {
ret = -1;
break;
}
if (!array_is_created(&br->definite_uids)) {
p_array_init(&br->definite_uids, result->pool, 32);
p_array_init(&br->scores, result->pool, 32);
}
if (seq_range_array_add(&br->definite_uids, uid)) {
/* duplicate result */
} else {
score = array_append_space(&br->scores);
score->uid = uid;
score->score = hits->score(i);
}
}
_CLDELETE(hits);
return ret;
} catch (CLuceneError &err) {
lucene_handle_error(index, err, "multi search");
return -1;
}
}
int lucene_index_lookup_multi(struct lucene_index *index,
HASH_TABLE_TYPE(wguid_result) guids,
struct mail_search_arg *args, bool and_args,
struct fts_multi_result *result)
{
struct mail_search_arg *arg;
if (lucene_index_open_search(index) <= 0)
return -1;
ARRAY_TYPE(lucene_query) def_queries;
t_array_init(&def_queries, 16);
bool have_definites = false;
for (arg = args; arg != NULL; arg = arg->next) {
if (lucene_add_definite_query(index, def_queries, arg, and_args)) {
arg->match_always = true;
have_definites = true;
}
}
if (have_definites) {
if (lucene_index_search_multi(index, guids,
def_queries, result) < 0)
return -1;
}
return 0;
}
/* State of an iteration over the documents of an index; see
   lucene_index_iter_init(), lucene_index_iter_next() and
   lucene_index_iter_deinit(). */
struct lucene_index_iter {
/* index being iterated */
struct lucene_index *index;
/* record handed out by lucene_index_iter_next(); overwritten on each call */
struct lucene_index_record rec;
/* term, query and sort order used for the search; released together in
   lucene_index_iter_deinit() */
Term *term;
WildcardQuery *query;
Sort *sort;
/* search results; NULL if the search wasn't run or failed */
Hits *hits;
/* index into hits of the next document to return */
size_t i;
/* set when opening the index or running the search failed */
bool failed;
};
/* Start iterating over the documents in the index, sorted by the "box"
   field and then by "uid".

   An iterator is always returned. If lucene_index_open_search() returns 0
   the iterator has no hits; if opening or the search itself fails, the
   iterator is additionally marked as failed, which
   lucene_index_iter_deinit() reports. */
struct lucene_index_iter *
lucene_index_iter_init(struct lucene_index *index)
{
	static const TCHAR *sort_fields[] = { _T("box"), _T("uid"), NULL };
	struct lucene_index_iter *iter = i_new(struct lucene_index_iter, 1);

	iter->index = index;

	int open_ret = lucene_index_open_search(index);
	if (open_ret < 0)
		iter->failed = true;
	if (open_ret <= 0) {
		/* nothing to search: leave hits unset */
		return iter;
	}

	/* box:* is used to select the documents to iterate */
	iter->term = _CLNEW Term(_T("box"), _T("*"));
	iter->query = _CLNEW WildcardQuery(iter->term);
	iter->sort = _CLNEW Sort(sort_fields);

	try {
		iter->hits = index->searcher->search(iter->query, iter->sort);
	} catch (CLuceneError &err) {
		lucene_handle_error(index, err, "rescan search");
		iter->failed = true;
	}
	return iter;
}
const struct lucene_index_record *
lucene_index_iter_next(struct lucene_index_iter *iter)
{
if (iter->hits == NULL)
return NULL;
if (iter->i == iter->hits->length())
return NULL;
Document *doc = &iter->hits->doc(iter->i);
iter->i++;
memset(&iter->rec, 0, sizeof(iter->rec));
(void)fts_lucene_get_mailbox_guid(iter->index, doc,
iter->rec.mailbox_guid);
(void)lucene_doc_get_uid(iter->index, doc, &iter->rec.uid);
iter->rec.part_num = lucene_doc_get_part(iter->index, doc);
return &iter->rec;
}
/* Free the iterator and everything it owns, and set *_iter to NULL.

   Returns -1 if the iterator was marked as failed, 0 otherwise. */
int lucene_index_iter_deinit(struct lucene_index_iter **_iter)
{
	struct lucene_index_iter *iter = *_iter;

	*_iter = NULL;
	/* read the status before the iterator is freed */
	const int ret = iter->failed ? -1 : 0;

	if (iter->hits != NULL) {
		_CLDELETE(iter->hits);
	}
	/* query, sort and term are created together in
	   lucene_index_iter_init(), so checking the query covers all three */
	if (iter->query != NULL) {
		_CLDELETE(iter->query);
		_CLDELETE(iter->sort);
		_CLDELETE(iter->term);
	}
	i_free(iter);
	return ret;
}
/* C-callable wrapper around CLucene's _lucene_shutdown(). */
void lucene_shutdown(void)
{
_lucene_shutdown();
}