bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2006-2018 Dovecot authors, see the included COPYING file */
33502e55a9bf4cafcd184ca9b114c126e420f856Timo Sirainen/* Lucene's default is 10000. Use it here also.. */
4d72a99412a0577b026b64afc27975f28a58d071Timo Sirainen#define LUCENE_INDEX_CLOSE_TIMEOUT_MSECS (120*1000)
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo SirainenARRAY_DEFINE_TYPE(lucene_query, struct lucene_query);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen wchar_t mailbox_guid[MAILBOX_GUID_HEX_LENGTH + 1];
678d0463849ba777106eb7875f27db07a5d8e3dfTimo Sirainen HASH_TABLE(uint8_t *, uint8_t *) seen_mailbox_guids;
8d14b5fc9c1ea1fad788315fc98fea89796a56d0Timo Sirainenstatic void lucene_handle_error(struct lucene_index *index, CLuceneError &err,
8d14b5fc9c1ea1fad788315fc98fea89796a56d0Timo Sirainen const char *msg);
5b6470e0e2ef4012430cdeca7d9b89c1278a0ed4Timo Sirainenstatic void rescan_clear_unseen_mailboxes(struct lucene_index *index,
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainenstruct lucene_index *lucene_index_init(const char *path,
de3466de0dcc4b0da5a1205591cb1fb99eb1392fTimo Sirainen mailbox_list_get_namespace(list)->user->default_normalizer;
5b6470e0e2ef4012430cdeca7d9b89c1278a0ed4Timo Sirainen /* this is valid only for doveadm dump, so it doesn't matter */
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen index->default_analyzer = _CLNEW KeywordAnalyzer();
f26ef7a3a562dc42a1e9a4dde546bd30df3241e8Timo Sirainen _CLNEW snowball::SnowballAnalyzer(index->normalizer,
f26ef7a3a562dc42a1e9a4dde546bd30df3241e8Timo Sirainen index->default_analyzer = _CLNEW standard::StandardAnalyzer();
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainenvoid lucene_index_close(struct lucene_index *index)
e8d5561143360da75d5ccb4991c2d1ffb437be1dTimo Sirainen lucene_handle_error(index, err, "IndexWriter::close");
e8d5561143360da75d5ccb4991c2d1ffb437be1dTimo Sirainen lucene_handle_error(index, err, "IndexReader::close");
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainenvoid lucene_index_deinit(struct lucene_index *index)
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainen array_foreach_modifiable(&index->analyzers, a) {
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainen if (--textcat_refcount == 0 && textcat != NULL) {
2e5d624013c30633e8ded148d338ce46c321a995Timo Sirainenstatic void lucene_data_translate(struct lucene_index *index,
2e5d624013c30633e8ded148d338ce46c321a995Timo Sirainen const char *whitespace_chars = index->set.whitespace_chars;
2e5d624013c30633e8ded148d338ce46c321a995Timo Sirainen unsigned int i;
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen if (*whitespace_chars == '\0' || index->set.use_libfts)
2e5d624013c30633e8ded148d338ce46c321a995Timo Sirainen for (i = 0; i < len; i++) {
2e5d624013c30633e8ded148d338ce46c321a995Timo Sirainen if (strchr(whitespace_chars, data[i]) != NULL)
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainenvoid lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize,
9b6eb5a72ffe61579e24c6ae4c6ce3d4e9104b95Timo Sirainen i_assert(sizeof(wchar_t) == sizeof(unichar_t));
3281669db44d09a087a203201248abbc81b3cc1aTimo Sirainen buffer_create_from_data(&buf, dest, sizeof(wchar_t) * destsize);
9b6eb5a72ffe61579e24c6ae4c6ce3d4e9104b95Timo Sirainen array_create_from_buffer(&dest_arr, &buf, sizeof(wchar_t));
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen if (uni_utf8_to_ucs4_n(src, srcsize, &dest_arr) < 0)
9b6eb5a72ffe61579e24c6ae4c6ce3d4e9104b95Timo Sirainen i_assert(array_count(&dest_arr)+1 == destsize);
2e5d624013c30633e8ded148d338ce46c321a995Timo Sirainenstatic const wchar_t *
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainent_lucene_utf8_to_tchar(struct lucene_index *index, const char *str)
9b6eb5a72ffe61579e24c6ae4c6ce3d4e9104b95Timo Sirainen i_assert(sizeof(wchar_t) == sizeof(unichar_t));
2e5d624013c30633e8ded148d338ce46c321a995Timo Sirainen chars = array_get_modifiable(&dest_arr, &len);
62bf16bd8bb79e308e64110ae8d0b2a55a4c1490Timo Sirainenvoid lucene_index_select_mailbox(struct lucene_index *index,
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen MAILBOX_GUID_HEX_LENGTH * sizeof(wchar_t));
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen index->mailbox_guid[MAILBOX_GUID_HEX_LENGTH] = '\0';
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainenvoid lucene_index_unselect_mailbox(struct lucene_index *index)
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen memset(index->mailbox_guid, 0, sizeof(index->mailbox_guid));
031d075daf75b74b286711c1b6f64c3ae70e541bTimo Sirainenstatic void lucene_handle_error(struct lucene_index *index, CLuceneError &err,
031d075daf75b74b286711c1b6f64c3ae70e541bTimo Sirainen const char *msg)
03f4c5f3502801f5b318f464cc75313a88558805Timo Sirainen i_error("lucene index %s: %s failed (#%d): %s",
03f4c5f3502801f5b318f464cc75313a88558805Timo Sirainen /* delete corrupted index. most IO errors are also about
03f4c5f3502801f5b318f464cc75313a88558805Timo Sirainen missing files and other such corruption.. */
1d9053f57383a2382c70f76b0790a7bf192aa891Sergey Kitov if (unlink_directory(index->path, (enum unlink_directory_flags)0, &error) < 0)
97e511960951550338d69cac98fb5f3ca2badb09Timo Sirainen i_error("unlink_directory(%s) failed: %s", index->path, error);
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainenstatic int lucene_index_open(struct lucene_index *index)
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainen index->reader = IndexReader::open(index->path);
031d075daf75b74b286711c1b6f64c3ae70e541bTimo Sirainen lucene_handle_error(index, err, "IndexReader::open()");
4d72a99412a0577b026b64afc27975f28a58d071Timo Sirainen index->to_close = timeout_add(LUCENE_INDEX_CLOSE_TIMEOUT_MSECS,
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainenstatic int lucene_index_open_search(struct lucene_index *index)
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainen index->searcher = _CLNEW IndexSearcher(index->reader);
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainenlucene_doc_get_uid(struct lucene_index *index, Document *doc, uint32_t *uid_r)
9b6eb5a72ffe61579e24c6ae4c6ce3d4e9104b95Timo Sirainen const TCHAR *uid = field == NULL ? NULL : field->stringValue();
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainen i_error("lucene: Corrupted FTS index %s: No UID for document",
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainen while (*uid != 0) {
57b523eeb99ed5d7f5002907a409cdef54353ce5Timo Sirainenlucene_doc_get_part(struct lucene_index *index, Document *doc)
57b523eeb99ed5d7f5002907a409cdef54353ce5Timo Sirainen const TCHAR *part = field == NULL ? NULL : field->stringValue();
57b523eeb99ed5d7f5002907a409cdef54353ce5Timo Sirainen while (*part != 0) {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainenint lucene_index_get_last_uid(struct lucene_index *index, uint32_t *last_uid_r)
37f96554a5734557cd454691d163e602d36384b4Timo Sirainen if ((ret = lucene_index_open_search(index)) <= 0)
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen Term mailbox_term(_T("box"), index->mailbox_guid);
031d075daf75b74b286711c1b6f64c3ae70e541bTimo Sirainen lucene_handle_error(index, err, "last_uid search");
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainenint lucene_index_get_doc_count(struct lucene_index *index, uint32_t *count_r)
5b6470e0e2ef4012430cdeca7d9b89c1278a0ed4Timo Sirainenstatic int lucene_settings_check(struct lucene_index *index)
5b6470e0e2ef4012430cdeca7d9b89c1278a0ed4Timo Sirainen set_checksum = fts_lucene_settings_checksum(&index->set);
5b6470e0e2ef4012430cdeca7d9b89c1278a0ed4Timo Sirainen ret = fts_index_have_compatible_settings(index->list, set_checksum);
264629908d96285d355aac0acf9b60b9b4be7fefTimo Sirainen i_warning("fts-lucene: Settings have changed, rebuilding index for mailbox");
5b6470e0e2ef4012430cdeca7d9b89c1278a0ed4Timo Sirainen /* settings changed, rebuild index */
97e511960951550338d69cac98fb5f3ca2badb09Timo Sirainen if (unlink_directory(index->path, (enum unlink_directory_flags)0, &error) < 0) {
97e511960951550338d69cac98fb5f3ca2badb09Timo Sirainen i_error("unlink_directory(%s) failed: %s", index->path, error);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainenint lucene_index_build_init(struct lucene_index *index)
f94e1eb89b911e7bef709a25580590c3fff594acTimo Sirainen lock_path = t_strdup_printf("%s/write.lock", index->path);
f94e1eb89b911e7bef709a25580590c3fff594acTimo Sirainen st.st_mtime < time(NULL) - LUCENE_LOCK_OVERRIDE_SECS) {
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainen bool exists = IndexReader::indexExists(index->path);
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainen index->writer = _CLNEW IndexWriter(index->path,
031d075daf75b74b286711c1b6f64c3ae70e541bTimo Sirainen lucene_handle_error(index, err, "IndexWriter()");
33502e55a9bf4cafcd184ca9b114c126e420f856Timo Sirainen index->writer->setMaxFieldLength(MAX_TERMS_PER_DOCUMENT);
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainenstatic Analyzer *get_analyzer(struct lucene_index *index, const char *lang)
88b9f9eb91da632d3e941fe4276f8ace03205b25Timo Sirainen normalizer_func_t *normalizer = index->normalizer;
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainen const struct lucene_analyzer *a;
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainen memset(&new_analyzer, 0, sizeof(new_analyzer));
88b9f9eb91da632d3e941fe4276f8ace03205b25Timo Sirainen _CLNEW snowball::SnowballAnalyzer(normalizer, lang);
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainen array_append_i(&index->analyzers.arr, &new_analyzer, 1);
c37098f8ce6d512ba41f09564d04ed25720f0a77Timo Sirainenstatic void *textcat_init(struct lucene_index *index)
c37098f8ce6d512ba41f09564d04ed25720f0a77Timo Sirainen const char *textcat_dir = index->set.textcat_dir;
c37098f8ce6d512ba41f09564d04ed25720f0a77Timo Sirainen /* textcat really wants the '/' suffix */
c37098f8ce6d512ba41f09564d04ed25720f0a77Timo Sirainen textcat_dir = t_strconcat(textcat_dir, "/", NULL);
c37098f8ce6d512ba41f09564d04ed25720f0a77Timo Sirainen return special_textcat_Init(index->set.textcat_conf, textcat_dir);
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainenguess_analyzer(struct lucene_index *index, const void *data, size_t size)
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainen /* try to guess the language */
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainen lang = textcat_Classify(textcat, (const char *)data,
c37098f8ce6d512ba41f09564d04ed25720f0a77Timo Sirainen if (strcmp(lang, index->set.default_language) == 0)
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainenguess_analyzer(struct lucene_index *index ATTR_UNUSED,
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainen const void *data ATTR_UNUSED, size_t size ATTR_UNUSED)
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainenstatic int lucene_index_build_flush(struct lucene_index *index)
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen index->cur_analyzer : index->default_analyzer;
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen index->writer->addDocument(index->doc, analyzer);
031d075daf75b74b286711c1b6f64c3ae70e541bTimo Sirainen lucene_handle_error(index, err, "IndexWriter::addDocument()");
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainenint lucene_index_build_more(struct lucene_index *index, uint32_t uid,
57b523eeb99ed5d7f5002907a409cdef54353ce5Timo Sirainen if (uid != index->prev_uid || part_idx != index->prev_part_idx) {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen index->doc->add(*_CLNEW Field(_T("uid"), id, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
57b523eeb99ed5d7f5002907a409cdef54353ce5Timo Sirainen swprintf(id, N_ELEMENTS(id), L"%u", part_idx);
57b523eeb99ed5d7f5002907a409cdef54353ce5Timo Sirainen index->doc->add(*_CLNEW Field(_T("part"), id, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen if (index->normalizer_buf != NULL && !index->set.use_libfts) {
f26ef7a3a562dc42a1e9a4dde546bd30df3241e8Timo Sirainen buffer_set_used_size(index->normalizer_buf, 0);
f26ef7a3a562dc42a1e9a4dde546bd30df3241e8Timo Sirainen index->normalizer(data, size, index->normalizer_buf);
f26ef7a3a562dc42a1e9a4dde546bd30df3241e8Timo Sirainen data = (const unsigned char *)index->normalizer_buf->data;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen lucene_utf8_n_to_tchar(data, size, dest, datasize);
aadd92e0901d82d0a47aee76e7b6c9825523313bTimo Sirainen lucene_data_translate(index, dest, datasize-1);
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen Field::INDEX_UNTOKENIZED : Field::INDEX_TOKENIZED;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen /* hdr_name should be ASCII, but don't break in case it isn't */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen lucene_utf8_n_to_tchar((const unsigned char *)hdr_name,
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen index->doc->add(*_CLNEW Field(_T("hdr"), wname, Field::STORE_NO | token_flag));
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen index->doc->add(*_CLNEW Field(_T("hdr"), dest, Field::STORE_NO | token_flag));
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen index->doc->add(*_CLNEW Field(wname, dest, Field::STORE_NO | token_flag));
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen } else if (size > 0) {
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen if (index->cur_analyzer == NULL && !index->set.use_libfts)
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainen index->cur_analyzer = guess_analyzer(index, data, size);
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen index->doc->add(*_CLNEW Field(_T("body"), dest, Field::STORE_NO | token_flag));
57f4445a46726a17bfe78b0964dd301a6ccb40ecTimo Sirainenint lucene_index_build_deinit(struct lucene_index *index)
dce232dfbb2244555299dffb3618a4724748d260Timo Sirainen /* no changes. */
031d075daf75b74b286711c1b6f64c3ae70e541bTimo Sirainen lucene_handle_error(index, err, "IndexWriter::close()");
de62ce819d59a529530da4b57be1b8d6dad13d6bTimo Sirainenwcharguid_to_guid(guid_128_t dest, const wchar_t *src)
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen unsigned int i;
3281669db44d09a087a203201248abbc81b3cc1aTimo Sirainen buffer_create_from_data(&buf, dest, GUID_128_SIZE);
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainenrescan_get_uids(struct mailbox *box, ARRAY_TYPE(seq_range) *uids)
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen if (mailbox_get_status(box, STATUS_MESSAGES, &status) < 0)
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen seq_range_array_add_range(&seqs, 1, status.messages);
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainenstatic int rescan_finish(struct rescan_context *ctx)
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen ret = fts_index_set_last_uid(ctx->box, ctx->last_existing_uid);
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainenfts_lucene_get_mailbox_guid(struct lucene_index *index, Document *doc,
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen const TCHAR *box_guid = field == NULL ? NULL : field->stringValue();
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen i_error("lucene: Corrupted FTS index %s: No mailbox for document",
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainen if (wcharguid_to_guid(guid_r, box_guid) < 0) {
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainen "box field not in expected format", index->path);
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainenrescan_open_mailbox(struct rescan_context *ctx, Document *doc)
de62ce819d59a529530da4b57be1b8d6dad13d6bTimo Sirainen if (fts_lucene_get_mailbox_guid(ctx->index, doc, guid) < 0)
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen if (memcmp(guid, ctx->box_guid, sizeof(guid)) == 0) {
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen /* same as last one */
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen memcpy(ctx->box_guid, guid, sizeof(ctx->box_guid));
678d0463849ba777106eb7875f27db07a5d8e3dfTimo Sirainen hash_table_insert(ctx->seen_mailbox_guids, guidp, guidp);
4145cbac82bfc0c8bfeceeca0ef841700117930cTimo Sirainen ctx->box = mailbox_alloc_guid(ctx->index->list, guid,
bf7dc750b95039981c0e9d728f313d50cf38a156Martti Rannanjärvi errstr = mailbox_get_last_internal_error(ctx->box, &error);
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen i_error("lucene: Couldn't open mailbox %s: %s",
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen if (mailbox_sync(ctx->box, (enum mailbox_sync_flags)0) < 0) {
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen i_error("lucene: Failed to sync mailbox %s: %s",
bf7dc750b95039981c0e9d728f313d50cf38a156Martti Rannanjärvi mailbox_get_last_internal_error(ctx->box, NULL));
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen seq_range_array_iter_init(&ctx->uids_iter, &ctx->uids);
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainenrescan_next(struct rescan_context *ctx, Document *doc)
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainen if (lucene_doc_get_uid(ctx->index, doc, &lucene_uid) < 0)
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen if (seq_range_array_iter_nth(&ctx->uids_iter, ctx->uids_iter_n,
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen /* lucene is missing an UID from the middle. delete
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen the rest of the messages from this mailbox and
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen "missing UIDs in the middle",
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen /* UID has been expunged from index. delete from
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen lucene as well. */
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen /* the rest of the messages have been expunged from index */
0f801c1bd3d684c219d7f3b1e75f8b85f66f7951Timo Sirainenrescan_clear_unseen_mailbox(struct lucene_index *index,
af2564c7f9e05ad245a032efdfbc5abbb9b70f1eTimo Sirainen mailbox_get_metadata(box, MAILBOX_METADATA_GUID,
af2564c7f9e05ad245a032efdfbc5abbb9b70f1eTimo Sirainen hash_table_lookup(rescan_ctx->seen_mailbox_guids,
af2564c7f9e05ad245a032efdfbc5abbb9b70f1eTimo Sirainen /* this mailbox had no records in lucene index.
af2564c7f9e05ad245a032efdfbc5abbb9b70f1eTimo Sirainen make sure its last indexed uid is 0 */
5b6470e0e2ef4012430cdeca7d9b89c1278a0ed4Timo Sirainenstatic void rescan_clear_unseen_mailboxes(struct lucene_index *index,
3b55c4f0e2ab51e2607556dde8d36360296b29e4Timo Sirainen const enum mailbox_list_iter_flags iter_flags =
5b6470e0e2ef4012430cdeca7d9b89c1278a0ed4Timo Sirainen hdr.settings_checksum = fts_lucene_settings_checksum(&index->set);
5b6470e0e2ef4012430cdeca7d9b89c1278a0ed4Timo Sirainen iter = mailbox_list_iter_init(index->list, "*", iter_flags);
af2564c7f9e05ad245a032efdfbc5abbb9b70f1eTimo Sirainen while ((info = mailbox_list_iter_next(iter)) != NULL)
0f801c1bd3d684c219d7f3b1e75f8b85f66f7951Timo Sirainen rescan_clear_unseen_mailbox(index, rescan_ctx, info->vname, &hdr);
9a85ec311d3216dd6dbad401543330c4356d651dTimo Sirainen ns->prefix[ns->prefix_len-1] == mail_namespace_get_sep(ns)) {
9a85ec311d3216dd6dbad401543330c4356d651dTimo Sirainen /* namespace prefix itself isn't returned by the listing */
0f801c1bd3d684c219d7f3b1e75f8b85f66f7951Timo Sirainen rescan_clear_unseen_mailbox(index, rescan_ctx, vname, &hdr);
03f4c5f3502801f5b318f464cc75313a88558805Timo Sirainenint lucene_index_rescan(struct lucene_index *index)
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen static const TCHAR *sort_fields[] = { _T("box"), _T("uid"), NULL };
3b55c4f0e2ab51e2607556dde8d36360296b29e4Timo Sirainen if ((ret = lucene_index_open_search(index)) < 0)
3b55c4f0e2ab51e2607556dde8d36360296b29e4Timo Sirainen ctx.pool = pool_alloconly_create("guids", 1024);
678d0463849ba777106eb7875f27db07a5d8e3dfTimo Sirainen hash_table_create(&ctx.seen_mailbox_guids, ctx.pool, 0,
3b55c4f0e2ab51e2607556dde8d36360296b29e4Timo Sirainen if (ret > 0) try {
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen Hits *hits = index->searcher->search(&query, &sort);
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen ret = rescan_open_mailbox(&ctx, &hits->doc(i));
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen else if (ret == 0)
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainen lucene_handle_error(index, err, "rescan search");
de62ce819d59a529530da4b57be1b8d6dad13d6bTimo Sirainenstatic void guid128_to_wguid(const guid_128_t guid,
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainen wchar_t wguid_hex[MAILBOX_GUID_HEX_LENGTH + 1])
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainen unsigned char guid_hex[MAILBOX_GUID_HEX_LENGTH];
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainen unsigned int i;
3281669db44d09a087a203201248abbc81b3cc1aTimo Sirainen buffer_create_from_data(&buf, guid_hex, MAILBOX_GUID_HEX_LENGTH);
de62ce819d59a529530da4b57be1b8d6dad13d6bTimo Sirainen binary_to_hex_append(&buf, guid, GUID_128_SIZE);
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainen for (i = 0; i < MAILBOX_GUID_HEX_LENGTH; i++)
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainenlucene_index_add_uid_filter(BooleanQuery *query,
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainen const struct fts_expunge_log_read_record *rec)
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainen unsigned int n;
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainen /* RangeQuery and WildcardQuery work by enumerating through all terms
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainen that match them, and then adding TermQueries for them. So we can
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainen simply do the same directly, and if it looks like there are too
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainen many terms just go through everything. */
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainen if (seq_range_count(&rec->uids) > FTS_LUCENE_MAX_SEARCH_TERMS)
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainen return false;
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainen seq_range_array_iter_init(&iter, &rec->uids); n = 0;
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainen while (seq_range_array_iter_nth(&iter, n++, &uid)) {
1e0842a68211be1c619f082c64d259e9d5cc63aeTimo Sirainen query->add(_CLNEW TermQuery(term), true, BooleanClause::SHOULD);
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainenlucene_index_expunge_record(struct lucene_index *index,
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainen const struct fts_expunge_log_read_record *rec)
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainen if ((ret = lucene_index_open_search(index)) <= 0)
4ca9910d9db31a68d9d501150045cfd4bb1e2ac7Timo Sirainen if (lucene_index_add_uid_filter(&uids_query, rec))
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainen query.add(&mailbox_query, BooleanClause::MUST);
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainen lucene_handle_error(index, err, "expunge search");
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainenint lucene_index_expunge_from_log(struct lucene_index *index,
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainen const struct fts_expunge_log_read_record *rec;
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainen while ((rec = fts_expunge_log_read_next(ctx)) != NULL) {
027c729b3107441f54a2602ccf2c67c6206998d5Timo Sirainen if (lucene_index_expunge_record(index, rec) < 0) {
009d6d90b33bc7f64fa8251ac392cc87a835b833Timo Sirainenint lucene_index_optimize(struct lucene_index *index)
c8296ac1ed68ed5c5168de545b76f9b27fc76d35Timo Sirainen writer = _CLNEW IndexWriter(index->path, index->default_analyzer, false);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen lucene_handle_error(index, err, "IndexWriter::optimize()");
e8d5561143360da75d5ccb4991c2d1ffb437be1dTimo Sirainen lucene_handle_error(index, err, "IndexWriter::close()");
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen// Mostly copy&pasted from CLucene's QueryParser
0ae79eb8ff677a3ee757556b90073072d8972d5dTimo Sirainenstatic Query* getFieldQuery(Analyzer *analyzer, const TCHAR* _field, const TCHAR* queryText, bool fuzzy) {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen // Use the analyzer to get all the tokens, and then build a TermQuery,
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen // PhraseQuery, or nothing based on the term count
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen TokenStream* source = analyzer->tokenStream(_field, &reader);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen CLVector<CL_NS(analysis)::Token*, Deletor::Object<CL_NS(analysis)::Token> > v;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen while (true) {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen }_CLCATCH_ERR(CL_ERR_IO, _CLLDELETE(source);_CLLDELETE(t);,{
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen _CLCATCH_ERR_CLEANUP(CL_ERR_IO, {_CLLDELETE(source);_CLLDELETE(t);} ); /* cleanup */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen if (v.size() == 0)
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen Term* tm = _CLNEW Term(_field, v.at(0)->termBuffer());
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen // no phrase query:
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen Term* tm = _CLNEW Term(_field, v.at(i)->termBuffer());
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen q->add(_CLNEW TermQuery(tm), true, BooleanClause::SHOULD);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen MultiPhraseQuery* mpq = _CLNEW MultiPhraseQuery();
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen if (t->getPositionIncrement() > 0 && multiTerms.size() > 0) {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen ValueArray<Term*> termsArray(multiTerms.size());
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen multiTerms.push_back(_CLNEW Term(_field, t->termBuffer()));
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen ValueArray<Term*> termsArray(multiTerms.size());
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen Term* tm = _CLNEW Term(_field, t->termBuffer());
06ed0c58ea392df22ccf4868aac494831ea756e1Timo Sirainenlucene_get_query_str(struct lucene_index *index,
06ed0c58ea392df22ccf4868aac494831ea756e1Timo Sirainen const TCHAR *key, const char *str, bool fuzzy)
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen const wchar_t *wstr = t_lucene_utf8_to_tchar(index, str);
f26ef7a3a562dc42a1e9a4dde546bd30df3241e8Timo Sirainen buffer_set_used_size(index->normalizer_buf, 0);
f26ef7a3a562dc42a1e9a4dde546bd30df3241e8Timo Sirainen index->normalizer(str, strlen(str), index->normalizer_buf);
f26ef7a3a562dc42a1e9a4dde546bd30df3241e8Timo Sirainen str = (const char *)index->normalizer_buf->data;
f26ef7a3a562dc42a1e9a4dde546bd30df3241e8Timo Sirainen analyzer = guess_analyzer(index, str, strlen(str));
06ed0c58ea392df22ccf4868aac494831ea756e1Timo Sirainen return getFieldQuery(analyzer, key, wvalue, fuzzy);
06ed0c58ea392df22ccf4868aac494831ea756e1Timo Sirainen const TCHAR *key, const struct mail_search_arg *arg)
06ed0c58ea392df22ccf4868aac494831ea756e1Timo Sirainen return lucene_get_query_str(index, key, arg->value.str, arg->fuzzy);
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainenlucene_add_definite_query(struct lucene_index *index,
117fb8c00336dc54bab9cfa547249df7a4970611Timo Sirainen bool and_args = (flags & FTS_LOOKUP_FLAG_AND_ARGS) != 0;
cbc8f9d71483a2cf71610f7e7e1f2dc9884bd556Baofeng Wang return false;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen /* FIXME: we could handle this by doing multiple queries.. */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen return false;
0ae79eb8ff677a3ee757556b90073072d8972d5dTimo Sirainen Query *q1 = lucene_get_query(index, _T("hdr"), arg);
0ae79eb8ff677a3ee757556b90073072d8972d5dTimo Sirainen Query *q2 = lucene_get_query(index, _T("body"), arg);
65a67a3c17679d4bd800067ca6273c17e0ae4c62Timo Sirainen if (!fts_header_want_indexed(arg->hdr_field_name) ||
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen return false;
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen t_lucene_utf8_to_tchar(index, t_str_lcase(arg->hdr_field_name)),
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen return false;
61af1856d2a92cd2c66615a0fbc9ef371a8da8fcTimo Sirainen /* couldn't handle this search after all (e.g. trying to search
61af1856d2a92cd2c66615a0fbc9ef371a8da8fcTimo Sirainen a stop word) */
61af1856d2a92cd2c66615a0fbc9ef371a8da8fcTimo Sirainen return false;
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainen struct lucene_query *lq = array_append_space(&queries);
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainenlucene_add_maybe_query(struct lucene_index *index,
117fb8c00336dc54bab9cfa547249df7a4970611Timo Sirainen bool and_args = (flags & FTS_LOOKUP_FLAG_AND_ARGS) != 0;
cbc8f9d71483a2cf71610f7e7e1f2dc9884bd556Baofeng Wang return false;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen /* FIXME: we could handle this by doing multiple queries.. */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen return false;
4f4daf7df84f450c7342de569bf25195e93d6bc7Timo Sirainen if (*arg->value.str == '\0' && !index->set.use_libfts) {
65a67a3c17679d4bd800067ca6273c17e0ae4c62Timo Sirainen /* checking potential existence of the header name */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen if (fts_header_want_indexed(arg->hdr_field_name))
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen return false;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen /* we can check if the search key exists in some header and
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen filter out the messages that have no chance of matching */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen return false;
dc03b7bb2cc2b78bf66856bdfedfb1cae774c43bTimo Sirainen /* couldn't handle this search after all (e.g. trying to search
dc03b7bb2cc2b78bf66856bdfedfb1cae774c43bTimo Sirainen a stop word) */
dc03b7bb2cc2b78bf66856bdfedfb1cae774c43bTimo Sirainen return false;
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainen struct lucene_query *lq = array_append_space(&queries);
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainenstatic bool queries_have_non_must_nots(ARRAY_TYPE(lucene_query) &queries)
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainenstatic void search_query_add(BooleanQuery &query,
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainen BooleanQuery *search_query = _CLNEW BooleanQuery();
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainen search_query->add(lq->query, true, lq->occur);
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainen query.add(search_query, true, BooleanClause::MUST);
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainen search_query->add(lq->query, true, BooleanClause::SHOULD);
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainen query.add(search_query, true, BooleanClause::MUST_NOT);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainenlucene_index_search(struct lucene_index *index,
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainen struct fts_result *result, ARRAY_TYPE(seq_range) *uids_r)
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen Term mailbox_term(_T("box"), index->mailbox_guid);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen query.add(&mailbox_query, BooleanClause::MUST);
49c848ccaab090b06add472122a1a7ebfaaf6044Timo Sirainen /* duplicate result */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainenint lucene_index_lookup(struct lucene_index *index,
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen for (arg = args; arg != NULL; arg = arg->next) {
117fb8c00336dc54bab9cfa547249df7a4970611Timo Sirainen if (lucene_add_definite_query(index, def_queries, arg, flags)) {
117fb8c00336dc54bab9cfa547249df7a4970611Timo Sirainen (flags & FTS_LOOKUP_FLAG_NO_AUTO_FUZZY) == 0 ?
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainen if (lucene_index_search(index, def_queries, result,
acc72c40c5bfe818013e0ae9c9e73eb90ae8fbb1Timo Sirainen /* FIXME: mixing up definite + maybe queries is broken. if the
acc72c40c5bfe818013e0ae9c9e73eb90ae8fbb1Timo Sirainen definite query matched, it'll just assume that the maybe
acc72c40c5bfe818013e0ae9c9e73eb90ae8fbb1Timo Sirainen queries matched as well */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen for (arg = args; arg != NULL; arg = arg->next) {
117fb8c00336dc54bab9cfa547249df7a4970611Timo Sirainen if (lucene_add_maybe_query(index, maybe_queries, arg, flags)) {
bd49b2e35dfa08753e89bef12a694978599d0fc0Timo Sirainen if (lucene_index_search(index, maybe_queries, NULL,
678d0463849ba777106eb7875f27db07a5d8e3dfTimo Sirainenlucene_index_search_multi(struct lucene_index *index,
a75d470c9223a75801418fcdda258885c36317e0Timo Sirainen while (hash_table_iterate(iter, guids, &key, &value)) {
a7b0916217f8ebb1da55e049f054e047f81bb911Timo Sirainen Term *term = _CLNEW Term(_T("box"), (wchar_t *)key);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen mailbox_query.add(q, true, BooleanClause::SHOULD);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen query.add(&mailbox_query, BooleanClause::MUST);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen Field *field = hits->doc(i).getField(_T("box"));
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen const TCHAR *box_guid = field == NULL ? NULL : field->stringValue();
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen i_error("lucene: Corrupted FTS index %s: No mailbox for document",
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen i_warning("lucene: Returned unexpected mailbox with GUID %ls", box_guid);
117fb8c00336dc54bab9cfa547249df7a4970611Timo Sirainen (flags & FTS_LOOKUP_FLAG_NO_AUTO_FUZZY) == 0 ?
49c848ccaab090b06add472122a1a7ebfaaf6044Timo Sirainen /* duplicate result */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen lucene_handle_error(index, err, "multi search");
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainenint lucene_index_lookup_multi(struct lucene_index *index,
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen for (arg = args; arg != NULL; arg = arg->next) {
117fb8c00336dc54bab9cfa547249df7a4970611Timo Sirainen if (lucene_add_definite_query(index, def_queries, arg, flags)) {
117fb8c00336dc54bab9cfa547249df7a4970611Timo Sirainen if (lucene_index_search_multi(index, guids, def_queries, flags,
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainenlucene_index_iter_init(struct lucene_index *index)
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainen static const TCHAR *sort_fields[] = { _T("box"), _T("uid"), NULL };
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainen if ((ret = lucene_index_open_search(index)) <= 0) {
def291c1ccc82f439541ea7b49652a1466a999fcTimo Sirainen iter->query = _CLNEW WildcardQuery(iter->term);
def291c1ccc82f439541ea7b49652a1466a999fcTimo Sirainen iter->hits = index->searcher->search(iter->query, iter->sort);
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainen lucene_handle_error(index, err, "rescan search");
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainenlucene_index_iter_next(struct lucene_index_iter *iter)
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainen (void)fts_lucene_get_mailbox_guid(iter->index, doc,
39ed514f9d401b3cb589595c6a2f532050254d77Timo Sirainen (void)lucene_doc_get_uid(iter->index, doc, &iter->rec.uid);
57b523eeb99ed5d7f5002907a409cdef54353ce5Timo Sirainen iter->rec.part_num = lucene_doc_get_part(iter->index, doc);