lucene-wrapper.cc revision b07cb607ae8864e7784787a9db96880db1f50e52
/* Copyright (c) 2006-2010 Dovecot authors, see the included COPYING file */
baf6671fd3847865da9b64dc8f8be85d81304840Daniel Lezcano
892bd61e0fcc788390abfadd32b1897943290879dlezcanoextern "C" {
5c320b769229d713e84b02ed6b7ae1309ac31dbbGuido Trotter#include "lib.h"
c13c0e08da7dbfecb52e85233ac6cd17afa5d818Stéphane Graber#include "array.h"
892bd61e0fcc788390abfadd32b1897943290879dlezcano#include "unichar.h"
d1de19abd0067f38bc08a4a3357de701a4e5571ddlezcano#include "hash.h"
d1de19abd0067f38bc08a4a3357de701a4e5571ddlezcano#include "hex-binary.h"
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber#include "unlink-directory.h"
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber#include "mail-index.h"
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber#include "mail-search.h"
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber#include "mail-namespace.h"
11cddd70eb8c285287b73562ba4208d74e1b9fdeSerge Hallyn#include "mailbox-list-private.h"
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber#include "mail-storage.h"
d1de19abd0067f38bc08a4a3357de701a4e5571ddlezcano#include "fts-expunge-log.h"
d1de19abd0067f38bc08a4a3357de701a4e5571ddlezcano#include "fts-lucene-plugin.h"
d1de19abd0067f38bc08a4a3357de701a4e5571ddlezcano#include "lucene-wrapper.h"
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber#include <sys/stat.h>
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber#ifdef HAVE_LUCENE_TEXTCAT
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber# include <libtextcat/textcat.h>
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber#else
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber#ifdef HAVE_LUCENE_EXTTEXTCAT
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber# include <libexttextcat/textcat.h>
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber#endif
d1de19abd0067f38bc08a4a3357de701a4e5571ddlezcano#endif
d1de19abd0067f38bc08a4a3357de701a4e5571ddlezcano};
d1de19abd0067f38bc08a4a3357de701a4e5571ddlezcano#include <CLucene.h>
810567bbbe283c547e4ac837545d1e592916df26Serge Hallyn#include <CLucene/util/CLStreams.h>
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber#include <CLucene/search/MultiPhraseQuery.h>
d1de19abd0067f38bc08a4a3357de701a4e5571ddlezcano#include "SnowballAnalyzer.h"
8a67a2b2eaf28033962a432c214bd3303c29c54cdlezcano
/* Lucene's default is 10000. Use the same limit here. */
8b8b04f80adf21480c25deb1aae263049ddd6754dlezcano#define MAX_TERMS_PER_DOCUMENT 10000
8b8b04f80adf21480c25deb1aae263049ddd6754dlezcano#define FTS_LUCENE_MAX_SEARCH_TERMS 1000
d1de19abd0067f38bc08a4a3357de701a4e5571ddlezcano
6a85cf91247b7dd9c3faeddceca8dacb96d02cd6Stéphane Graber#define LUCENE_LOCK_OVERRIDE_SECS 60
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graber
6a85cf91247b7dd9c3faeddceca8dacb96d02cd6Stéphane Graberusing namespace lucene::document;
4019712d198a7d50b08b326ade17f5ff1666efbbStéphane Graberusing namespace lucene::index;
6a85cf91247b7dd9c3faeddceca8dacb96d02cd6Stéphane Graberusing namespace lucene::search;
6a85cf91247b7dd9c3faeddceca8dacb96d02cd6Stéphane Graberusing namespace lucene::queryParser;
6a85cf91247b7dd9c3faeddceca8dacb96d02cd6Stéphane Graberusing namespace lucene::analysis;
d1de19abd0067f38bc08a4a3357de701a4e5571ddlezcanousing namespace lucene::analysis;
f080ffd7d656fbd9505a8e8eb52a05d61355c677Dwight Engenusing namespace lucene::util;
f080ffd7d656fbd9505a8e8eb52a05d61355c677Dwight Engen
f080ffd7d656fbd9505a8e8eb52a05d61355c677Dwight Engenstruct lucene_query {
f080ffd7d656fbd9505a8e8eb52a05d61355c677Dwight Engen Query *query;
aa8d013ec5b09cd1cd904173d6234ef126eb2126Peter Simons BooleanClause::Occur occur;
7822022c4c72cee06905b540b89b653491d6f6b2Stéphane Graber};
6a85cf91247b7dd9c3faeddceca8dacb96d02cd6Stéphane GraberARRAY_DEFINE_TYPE(lucene_query, struct lucene_query);
8a67a2b2eaf28033962a432c214bd3303c29c54cdlezcano
aa8d013ec5b09cd1cd904173d6234ef126eb2126Peter Simonsstruct lucene_analyzer {
7822022c4c72cee06905b540b89b653491d6f6b2Stéphane Graber char *lang;
6a85cf91247b7dd9c3faeddceca8dacb96d02cd6Stéphane Graber Analyzer *analyzer;
8a67a2b2eaf28033962a432c214bd3303c29c54cdlezcano};
aa8d013ec5b09cd1cd904173d6234ef126eb2126Peter Simons
7822022c4c72cee06905b540b89b653491d6f6b2Stéphane Graberstruct lucene_index {
6a85cf91247b7dd9c3faeddceca8dacb96d02cd6Stéphane Graber char *path;
99e4008cad9e959b683c6f48411fcf15a92be3b5Michel Normand struct mailbox_list *list;
10fba81b9d0221b8e47aa1e0b43236413b7d28dfMichel Normand struct fts_lucene_settings set;
8b8b04f80adf21480c25deb1aae263049ddd6754dlezcano normalizer_func_t *normalizer;
9a42db48e0bcf4f34b05a3de1cda23e06f51d131Stéphane Graber
3b9246c4aae3f7602c0ad64f5b1204eb559e5b07Daniel Lezcano wchar_t mailbox_guid[MAILBOX_GUID_HEX_LENGTH + 1];
892bd61e0fcc788390abfadd32b1897943290879dlezcano
IndexReader *reader;
IndexWriter *writer;
IndexSearcher *searcher;
Analyzer *default_analyzer, *cur_analyzer;
ARRAY(struct lucene_analyzer) analyzers;
Document *doc;
uint32_t prev_uid;
};
/* Working state for lucene_index_rescan(): tracks the mailbox currently being
   compared against the Lucene documents and the position within its UIDs. */
struct rescan_context {
	/* index being rescanned */
	struct lucene_index *index;

	/* currently opened mailbox, or NULL when none is open */
	struct mailbox *box;
	/* GUID of the mailbox most recently looked up from a document */
	guid_128_t box_guid;
	/* cached result of that lookup, returned again for documents that
	   carry the same GUID */
	int box_ret;

	/* pool holding the GUID copies stored in seen_mailbox_guids */
	pool_t pool;
	/* GUIDs of all mailboxes that had at least one document */
	HASH_TABLE(uint8_t *, uint8_t *) seen_mailbox_guids;

	/* UIDs existing in the current mailbox and the iteration state
	   used to walk them in step with the sorted Lucene hits */
	ARRAY_TYPE(seq_range) uids;
	struct seq_range_iter uids_iter;
	unsigned int uids_iter_n;

	/* last UID found both in the mailbox and in Lucene */
	uint32_t last_existing_uid;
	/* TRUE once the "missing UIDs" warning was logged for this mailbox */
	bool warned;
};
/* Process-wide textcat handle shared by all indexes. It is created on first
   use by guess_analyzer() and dropped when the last index is deinitialized. */
static void *textcat = NULL;
/* Set once textcat initialization has failed so it is not retried. */
static bool textcat_broken = FALSE;
/* Number of live lucene_index instances. */
static int textcat_refcount = 0;

/* Defined further below; needed earlier by the error and settings handling. */
static void rescan_clear_unseen_mailboxes(struct lucene_index *index,
					  struct rescan_context *rescan_ctx);
/* Allocate and initialize an index object for the given directory.

   path: index directory (copied).
   list: owning mailbox list; only dereferenced when set is non-NULL and
         normalization is enabled.
   set:  plugin settings, or NULL (doveadm dump), in which case an empty
         default language is used and no normalizer is configured.

   Returns the new index. No Lucene files are opened here. */
struct lucene_index *lucene_index_init(const char *path,
				       struct mailbox_list *list,
				       const struct fts_lucene_settings *set)
{
	struct lucene_index *index;

	index = i_new(struct lucene_index, 1);
	index->path = i_strdup(path);
	index->list = list;
	if (set != NULL) {
		index->set = *set;
		/* set must be checked before looking at set->normalize */
		index->normalizer = !set->normalize ? NULL :
			mailbox_list_get_namespace(list)->user->default_normalizer;
	} else {
		/* this is valid only for doveadm dump, so it doesn't matter */
		index->set.default_language = "";
		index->normalizer = NULL;
	}
#ifdef HAVE_LUCENE_STEMMER
	index->default_analyzer =
		_CLNEW snowball::SnowballAnalyzer(index->normalizer,
						  index->set.default_language);
#else
	index->default_analyzer = _CLNEW standard::StandardAnalyzer();
	/* the standard analyzer has no hook for a normalizer */
	i_assert(index->normalizer == NULL);
#endif

	i_array_init(&index->analyzers, 32);
	textcat_refcount++;

	return index;
}
/* Release the searcher, writer and reader handles (each may be NULL).
   The searcher is constructed from index->reader, so it is destroyed first
   and the reader last; dependants never outlive what they were built on. */
void lucene_index_close(struct lucene_index *index)
{
	_CLDELETE(index->searcher);
	_CLDELETE(index->writer);
	_CLDELETE(index->reader);
}
/* Close the index and free everything owned by it, including the index
   object itself. When the last index goes away, the shared textcat handle
   is dropped as well. */
void lucene_index_deinit(struct lucene_index *index)
{
	struct lucene_analyzer *entry;

	lucene_index_close(index);

	/* free the per-language analyzer cache */
	array_foreach_modifiable(&index->analyzers, entry) {
		i_free(entry->lang);
		_CLDELETE(entry->analyzer);
	}
	array_free(&index->analyzers);

	textcat_refcount--;
	if (textcat_refcount == 0 && textcat != NULL) {
#ifdef HAVE_LUCENE_TEXTCAT
		textcat_Done(textcat);
#endif
		textcat = NULL;
	}

	_CLDELETE(index->default_analyzer);
	i_free(index->path);
	i_free(index);
}
/* Replace every character of data[0..len-1] that is listed in the
   whitespace_chars setting with a space.

   whitespace_chars is a byte string and is searched with strchr(), which
   converts its argument to char and treats the list's terminating NUL as
   part of the string. Therefore only characters in the range 1..0x7f are
   looked up: wider code points must not be truncated into a false match,
   and NUL must never "match" the terminator. */
static void lucene_data_translate(struct lucene_index *index,
				  wchar_t *data, unsigned int len)
{
	const char *whitespace_chars = index->set.whitespace_chars;
	unsigned int i;

	if (*whitespace_chars == '\0')
		return;

	for (i = 0; i < len; i++) {
		if (data[i] < 1 || data[i] > 0x7f)
			continue;
		if (strchr(whitespace_chars, (char)data[i]) != NULL)
			data[i] = ' ';
	}
}
void lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize,
wchar_t *dest, size_t destsize)
{
ARRAY_TYPE(unichars) dest_arr;
buffer_t buf = { 0, 0, { 0, 0, 0, 0, 0 } };
i_assert(sizeof(wchar_t) == sizeof(unichar_t));
buffer_create_from_data(&buf, dest, sizeof(wchar_t) * destsize);
array_create_from_buffer(&dest_arr, &buf, sizeof(wchar_t));
if (uni_utf8_to_ucs4_n(src, srcsize, &dest_arr) < 0)
i_unreached();
i_assert(array_count(&dest_arr)+1 == destsize);
dest[destsize-1] = 0;
}
/* Convert a NUL-terminated UTF-8 string into a NUL-terminated wide string
   held in an array created with t_array_init().

   translate: when TRUE, characters listed in the whitespace_chars setting
   are replaced with spaces; when FALSE the text is returned untouched
   (used for header field names).

   Invalid UTF-8 is treated as unreachable. */
static const wchar_t *
t_lucene_utf8_to_tchar(struct lucene_index *index,
		       const char *str, bool translate)
{
	ARRAY_TYPE(unichars) dest_arr;
	const unichar_t *chars;
	wchar_t *ret;
	unsigned int len;

	i_assert(sizeof(wchar_t) == sizeof(unichar_t));

	t_array_init(&dest_arr, strlen(str) + 1);
	if (uni_utf8_to_ucs4(str, &dest_arr) < 0)
		i_unreached();
	/* appended element is the terminator */
	(void)array_append_space(&dest_arr);

	chars = array_get_modifiable(&dest_arr, &len);
	ret = (wchar_t *)chars;
	/* len includes the terminator, which is not translated */
	if (translate)
		lucene_data_translate(index, ret, len - 1);
	return ret;
}
/* Remember the hex GUID of the mailbox that subsequent build and search
   calls operate on. guid holds MAILBOX_GUID_HEX_LENGTH characters and does
   not need to be terminated; the stored copy is. */
void lucene_index_select_mailbox(struct lucene_index *index,
				 const wchar_t guid[MAILBOX_GUID_HEX_LENGTH])
{
	const size_t guid_bytes = MAILBOX_GUID_HEX_LENGTH * sizeof(wchar_t);

	memcpy(index->mailbox_guid, guid, guid_bytes);
	index->mailbox_guid[MAILBOX_GUID_HEX_LENGTH] = '\0';
}
void lucene_index_unselect_mailbox(struct lucene_index *index)
{
memset(index->mailbox_guid, 0, sizeof(index->mailbox_guid));
}
/* Log a CLucene exception. If the index belongs to a mailbox list and the
   error indicates corruption or an I/O problem, the index directory is
   deleted and the mailboxes' FTS headers are reset so that everything gets
   reindexed. msg names the operation that failed. */
static void lucene_handle_error(struct lucene_index *index, CLuceneError &err,
				const char *msg)
{
	const char *reason = err.what();

	i_error("lucene index %s: %s failed (#%d): %s",
		index->path, msg, err.number(), reason);

	if (index->list == NULL)
		return;
	if (err.number() != CL_ERR_CorruptIndex &&
	    err.number() != CL_ERR_IO)
		return;

	/* delete corrupted index. most IO errors are also about
	   missing files and other such corruption.. */
	if (unlink_directory(index->path, UNLINK_DIRECTORY_FLAG_RMDIR) < 0) {
		/* an already missing directory is not worth reporting */
		if (errno != ENOENT)
			i_error("unlink_directory(%s) failed: %m", index->path);
	}
	rescan_clear_unseen_mailboxes(index, NULL);
}
/* Make sure index->reader is open.
   Returns 1 if a reader is available, 0 if no index exists on disk,
   -1 if opening failed (already logged). */
static int lucene_index_open(struct lucene_index *index)
{
	if (index->reader == NULL) {
		if (!IndexReader::indexExists(index->path))
			return 0;

		try {
			index->reader = IndexReader::open(index->path);
		} catch (CLuceneError &err) {
			lucene_handle_error(index, err, "IndexReader::open()");
			return -1;
		}
	}
	return 1;
}
/* Make sure index->searcher exists, opening the reader first if needed.
   Returns 1 on success, or lucene_index_open()'s 0 / -1 result when the
   reader could not be opened. */
static int lucene_index_open_search(struct lucene_index *index)
{
	if (index->searcher == NULL) {
		int open_ret = lucene_index_open(index);

		if (open_ret <= 0)
			return open_ret;
		index->searcher = _CLNEW IndexSearcher(index->reader);
	}
	return 1;
}
static int
lucene_doc_get_uid(struct lucene_index *index, Document *doc, uint32_t *uid_r)
{
Field *field = doc->getField(_T("uid"));
const TCHAR *uid = field == NULL ? NULL : field->stringValue();
if (uid == NULL) {
i_error("lucene: Corrupted FTS index %s: No UID for document",
index->path);
return -1;
}
uint32_t num = 0;
while (*uid != 0) {
num = num*10 + (*uid - '0');
uid++;
}
*uid_r = num;
return 0;
}
int lucene_index_get_last_uid(struct lucene_index *index, uint32_t *last_uid_r)
{
int ret = 0;
*last_uid_r = 0;
if ((ret = lucene_index_open_search(index)) <= 0)
return ret;
Term mailbox_term(_T("box"), index->mailbox_guid);
TermQuery query(&mailbox_term);
uint32_t last_uid = 0;
try {
Hits *hits = index->searcher->search(&query);
for (size_t i = 0; i < hits->length(); i++) {
uint32_t uid;
if (lucene_doc_get_uid(index, &hits->doc(i),
&uid) < 0) {
ret = -1;
break;
}
if (uid > last_uid)
last_uid = uid;
}
_CLDELETE(hits);
} catch (CLuceneError &err) {
lucene_handle_error(index, err, "last_uid search");
ret = -1;
}
*last_uid_r = last_uid;
return ret;
}
/* Store the number of documents in the index into *count_r (0 when no
   index exists). Returns 0 on success, -1 if the index couldn't be opened. */
int lucene_index_get_doc_count(struct lucene_index *index, uint32_t *count_r)
{
	if (index->reader == NULL) {
		/* drop any other open handles before reopening */
		lucene_index_close(index);

		int open_ret = lucene_index_open(index);
		if (open_ret < 0)
			return -1;
		if (open_ret == 0) {
			*count_r = 0;
			return 0;
		}
	}
	*count_r = index->reader->numDocs();
	return 0;
}
/* Compare the checksum of the current settings against what
   fts_index_have_compatible_settings() reports for the mailbox list.
   A non-zero result from that call is returned as is. When it returns 0
   the settings have changed: the index directory is deleted and the
   mailboxes' FTS headers are reset; the result is then 0, or -1 if the
   directory could not be removed. */
static int lucene_settings_check(struct lucene_index *index)
{
	uint32_t set_checksum;
	int ret = 0;

	set_checksum = fts_lucene_settings_checksum(&index->set);
	ret = fts_index_have_compatible_settings(index->list, set_checksum);
	if (ret != 0)
		return ret;

	/* settings changed, rebuild index */
	if (unlink_directory(index->path, UNLINK_DIRECTORY_FLAG_RMDIR) < 0) {
		i_error("unlink_directory(%s) failed: %m", index->path);
		ret = -1;
	} else {
		rescan_clear_unseen_mailboxes(index, NULL);
	}
	return ret;
}
/* Prepare the index for adding documents: close any open handles, remove a
   write.lock older than LUCENE_LOCK_OVERRIDE_SECS, verify the settings and
   open an IndexWriter (creating the index if none exists).
   Returns 0 on success, -1 on failure. */
int lucene_index_build_init(struct lucene_index *index)
{
	struct stat lock_st;
	const char *lock_path;

	lucene_index_close(index);

	lock_path = t_strdup_printf("%s/write.lock", index->path);
	if (stat(lock_path, &lock_st) == 0) {
		/* a lock this old is assumed to be left over */
		if (lock_st.st_mtime < time(NULL) - LUCENE_LOCK_OVERRIDE_SECS &&
		    unlink(lock_path) < 0)
			i_error("unlink(%s) failed: %m", lock_path);
	}

	if (lucene_settings_check(index) < 0)
		return -1;

	bool have_index = IndexReader::indexExists(index->path);
	try {
		index->writer = _CLNEW IndexWriter(index->path,
						   index->default_analyzer,
						   !have_index);
	} catch (CLuceneError &err) {
		lucene_handle_error(index, err, "IndexWriter()");
		return -1;
	}
	index->writer->setMaxFieldLength(MAX_TERMS_PER_DOCUMENT);
	return 0;
}
#ifdef HAVE_LUCENE_TEXTCAT
/* Return the Snowball analyzer for the given language, creating it and
   adding it to the index's cache on first use. */
static Analyzer *get_analyzer(struct lucene_index *index, const char *lang)
{
	const struct lucene_analyzer *cached;
	struct lucene_analyzer entry;

	array_foreach(&index->analyzers, cached) {
		if (strcmp(cached->lang, lang) == 0)
			return cached->analyzer;
	}

	/* not cached yet */
	memset(&entry, 0, sizeof(entry));
	entry.lang = i_strdup(lang);
	entry.analyzer =
		_CLNEW snowball::SnowballAnalyzer(index->normalizer, lang);
	array_append_i(&index->analyzers.arr, &entry, 1);
	return entry.analyzer;
}
/* Create a textcat handle from the textcat_conf / textcat_dir settings.
   Returns NULL when no directory is configured; otherwise returns whatever
   special_textcat_Init() returns. */
static void *textcat_init(struct lucene_index *index)
{
	const char *dir = index->set.textcat_dir;
	unsigned int dir_len;

	if (dir == NULL)
		return NULL;

	/* textcat really wants the '/' suffix */
	dir_len = strlen(dir);
	if (dir_len > 0 && dir[dir_len-1] != '/')
		dir = t_strconcat(dir, "/", NULL);
	return special_textcat_Init(index->set.textcat_conf, dir);
}
/* Guess the language of the given text with textcat (at most the first 500
   bytes are examined) and return the matching analyzer. Returns NULL when
   textcat is unavailable or its answer is not of the form "[lang]...". */
static Analyzer *
guess_analyzer(struct lucene_index *index, const void *data, size_t size)
{
	if (textcat_broken)
		return NULL;

	if (textcat == NULL) {
		textcat = textcat_init(index);
		if (textcat == NULL) {
			/* don't retry on every call */
			textcat_broken = TRUE;
			return NULL;
		}
	}

	/* try to guess the language */
	const char *guess = textcat_Classify(textcat, (const char *)data,
					     I_MIN(size, 500));
	const char *closing = strchr(guess, ']');
	if (guess[0] != '[' || closing == NULL)
		return NULL;

	/* only the first bracketed name is used */
	const char *lang = t_strdup_until(guess+1, closing);
	if (strcmp(lang, index->set.default_language) != 0)
		return get_analyzer(index, lang);
	return index->default_analyzer;
}
#else
/* Variant used when HAVE_LUCENE_TEXTCAT is not defined: no language
   detection is done and NULL is always returned, so callers use the
   default analyzer. */
static Analyzer *
guess_analyzer(struct lucene_index *index ATTR_UNUSED,
	       const void *data ATTR_UNUSED, size_t size ATTR_UNUSED)
{
	return NULL;
}
#endif
static int lucene_index_build_flush(struct lucene_index *index)
{
int ret = 0;
if (index->doc == NULL)
return 0;
try {
index->writer->addDocument(index->doc,
index->cur_analyzer != NULL ?
index->cur_analyzer :
index->default_analyzer);
} catch (CLuceneError &err) {
lucene_handle_error(index, err, "IndexWriter::addDocument()");
ret = -1;
}
_CLDELETE(index->doc);
index->doc = NULL;
index->cur_analyzer = NULL;
return ret;
}
/* Add one piece of a message to the index.

   uid:      message UID. When it differs from the previous call's UID the
             pending document is flushed and a new one is started with
             "uid" and "box" fields.
   data:     UTF-8 text of size bytes.
   hdr_name: header name when data is a header value, NULL for body text.

   Header values are indexed under "hdr" (together with the lowercased
   header name) and, for headers accepted by fts_header_want_indexed(),
   also under a field named after the header. Body text goes to "body" and
   is used to pick the document's analyzer if none was chosen yet.

   Returns 0 on success, -1 if flushing the previous document failed. */
int lucene_index_build_more(struct lucene_index *index, uint32_t uid,
			    const unsigned char *data, size_t size,
			    const char *hdr_name)
{
	wchar_t id[MAX_INT_STRLEN];
	size_t namesize, datasize;

	if (uid != index->prev_uid) {
		if (lucene_index_build_flush(index) < 0)
			return -1;
		index->prev_uid = uid;

		index->doc = _CLNEW Document();
		swprintf(id, N_ELEMENTS(id), L"%u", uid);
		index->doc->add(*_CLNEW Field(_T("uid"), id, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
		index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
	}

	/* +1 for the terminating NUL */
	datasize = uni_utf8_strlen_n(data, size) + 1;
	wchar_t dest[datasize];
	lucene_utf8_n_to_tchar(data, size, dest, datasize);
	/* datasize counts the terminating NUL, which must not be handed to
	   the translation step: only the characters are translated */
	lucene_data_translate(index, dest, datasize - 1);

	if (hdr_name != NULL) {
		/* hdr_name should be ASCII, but don't break in case it isn't */
		hdr_name = t_str_lcase(hdr_name);
		namesize = uni_utf8_strlen(hdr_name) + 1;
		wchar_t wname[namesize];
		lucene_utf8_n_to_tchar((const unsigned char *)hdr_name,
				       strlen(hdr_name), wname, namesize);

		/* both the header name and its value are searchable via "hdr" */
		index->doc->add(*_CLNEW Field(_T("hdr"), wname, Field::STORE_NO | Field::INDEX_TOKENIZED));
		index->doc->add(*_CLNEW Field(_T("hdr"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED));

		if (fts_header_want_indexed(hdr_name))
			index->doc->add(*_CLNEW Field(wname, dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
	} else if (size > 0) {
		if (index->cur_analyzer == NULL)
			index->cur_analyzer = guess_analyzer(index, data, size);
		index->doc->add(*_CLNEW Field(_T("body"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
	}
	return 0;
}
/* Finish a build: flush the pending document, close the writer and release
   all handles. Does nothing when no message was added since build init.
   Returns 0 on success, -1 on failure. */
int lucene_index_build_deinit(struct lucene_index *index)
{
	if (index->prev_uid == 0) {
		/* no changes. */
		return 0;
	}
	index->prev_uid = 0;

	if (index->writer == NULL) {
		lucene_index_close(index);
		return -1;
	}

	int ret = lucene_index_build_flush(index) < 0 ? -1 : 0;
	try {
		index->writer->close();
	} catch (CLuceneError &err) {
		lucene_handle_error(index, err, "IndexWriter::close()");
		ret = -1;
	}

	lucene_index_close(index);
	return ret;
}
/* Convert a wide-character hex GUID (exactly GUID_128_SIZE*2 characters
   from [0-9a-f], NUL-terminated) into its binary form in dest.
   Returns -1 if src has any other character or a different length,
   otherwise the result of hex_to_binary(). */
static int
wcharguid_to_guid(guid_128_t dest, const wchar_t *src)
{
	buffer_t out = { 0, 0, { 0, 0, 0, 0, 0 } };
	char hex[GUID_128_SIZE*2 + 1];
	unsigned int pos;

	for (pos = 0; pos < sizeof(hex)-1; pos++) {
		bool is_digit = src[pos] >= '0' && src[pos] <= '9';
		bool is_lower_hex = src[pos] >= 'a' && src[pos] <= 'f';

		if (!is_digit && !is_lower_hex)
			return -1;
		hex[pos] = src[pos];
	}
	/* must end exactly here */
	if (src[pos] != '\0')
		return -1;
	hex[pos] = '\0';

	buffer_create_from_data(&out, dest, GUID_128_SIZE);
	return hex_to_binary(hex, &out);
}
/* Collect the UIDs of all messages currently in the mailbox into uids.
   Returns 0 on success, -1 if the mailbox status couldn't be read. */
static int
rescan_get_uids(struct mailbox *box, ARRAY_TYPE(seq_range) *uids)
{
	struct mailbox_status status;

	if (mailbox_get_status(box, STATUS_MESSAGES, &status) < 0)
		return -1;
	if (status.messages == 0)
		return 0;

	T_BEGIN {
		ARRAY_TYPE(seq_range) all_seqs;

		/* sequences 1..messages cover the whole mailbox */
		t_array_init(&all_seqs, 2);
		seq_range_array_add_range(&all_seqs, 1, status.messages);
		mailbox_get_uid_range(box, &all_seqs, uids);
	} T_END;
	return 0;
}
/* Finish the current mailbox: record the last UID found in both the
   mailbox and Lucene, then free the mailbox.
   Returns the result of fts_index_set_last_uid(). */
static int rescan_finish(struct rescan_context *ctx)
{
	const int set_ret =
		fts_index_set_last_uid(ctx->box, ctx->last_existing_uid);

	mailbox_free(&ctx->box);
	return set_ret;
}
static int
fts_lucene_get_mailbox_guid(struct lucene_index *index, Document *doc,
guid_128_t guid_r)
{
Field *field = doc->getField(_T("box"));
const TCHAR *box_guid = field == NULL ? NULL : field->stringValue();
if (box_guid == NULL) {
i_error("lucene: Corrupted FTS index %s: No mailbox for document",
index->path);
return -1;
}
if (wcharguid_to_guid(guid_r, box_guid) < 0) {
i_error("lucene: Corrupted FTS index %s: "
"box field not in expected format", index->path);
return -1;
}
return 0;
}
/* Make ctx->box the mailbox that the given document belongs to.

   Documents arrive sorted by mailbox, so the result for a GUID is cached in
   ctx->box_ret and reused while the GUID stays the same. On a GUID change
   the previous mailbox is finished, the new one is opened and synced, and
   the UID iteration state is reset.

   Returns 1 if the mailbox is open and ready for rescan_next(), 0 if the
   document has no usable mailbox GUID or the mailbox doesn't exist, -1 on
   other errors. */
static int
rescan_open_mailbox(struct rescan_context *ctx, Document *doc)
{
	guid_128_t guid, *guidp;
	int ret;

	if (fts_lucene_get_mailbox_guid(ctx->index, doc, guid) < 0)
		return 0;

	if (memcmp(guid, ctx->box_guid, sizeof(guid)) == 0) {
		/* same as last one */
		return ctx->box_ret;
	}
	memcpy(ctx->box_guid, guid, sizeof(ctx->box_guid));

	/* remember that this mailbox has documents in the index; the copy
	   lives in ctx->pool */
	guidp = p_new(ctx->pool, guid_128_t, 1);
	memcpy(guidp, guid, sizeof(*guidp));
	hash_table_insert(ctx->seen_mailbox_guids, guidp, guidp);

	/* finish the previous mailbox before switching */
	if (ctx->box != NULL)
		rescan_finish(ctx);
	ctx->box = mailbox_alloc_guid(ctx->index->list, guid,
				      (enum mailbox_flags)0);
	if (mailbox_open(ctx->box) < 0) {
		enum mail_error error;
		const char *errstr;

		errstr = mailbox_get_last_error(ctx->box, &error);
		/* a missing mailbox is not an error: 0 is returned for it */
		if (error == MAIL_ERROR_NOTFOUND)
			ret = 0;
		else {
			i_error("lucene: Couldn't open mailbox %s: %s",
				mailbox_get_vname(ctx->box), errstr);
			ret = -1;
		}
		mailbox_free(&ctx->box);
		ctx->box_ret = ret;
		return ret;
	}
	if (mailbox_sync(ctx->box, (enum mailbox_sync_flags)0) < 0) {
		i_error("lucene: Failed to sync mailbox %s: %s",
			mailbox_get_vname(ctx->box),
			mailbox_get_last_error(ctx->box, NULL));
		mailbox_free(&ctx->box);
		ctx->box_ret = -1;
		return -1;
	}

	/* start walking the new mailbox's UIDs from the beginning; the
	   return value of rescan_get_uids() is not checked */
	array_clear(&ctx->uids);
	rescan_get_uids(ctx->box, &ctx->uids);

	ctx->warned = FALSE;
	ctx->last_existing_uid = 0;
	ctx->uids_iter_n = 0;
	seq_range_array_iter_init(&ctx->uids_iter, &ctx->uids);

	ctx->box_ret = 1;
	return 1;
}
/* Compare the next Lucene document of the current mailbox against the next
   UID that exists in the mailbox.
   Returns 1 if they match (the document is kept and the UID position
   advances), 0 in every other case. */
static int
rescan_next(struct rescan_context *ctx, Document *doc)
{
	uint32_t doc_uid, mailbox_uid;

	if (lucene_doc_get_uid(ctx->index, doc, &doc_uid) < 0)
		return 0;

	if (!seq_range_array_iter_nth(&ctx->uids_iter, ctx->uids_iter_n,
				      &mailbox_uid)) {
		/* the rest of the messages have been expunged from index */
		return 0;
	}

	if (mailbox_uid == doc_uid) {
		ctx->uids_iter_n++;
		ctx->last_existing_uid = mailbox_uid;
		return 1;
	}

	if (mailbox_uid < doc_uid) {
		/* lucene is missing an UID from the middle. delete
		   the rest of the messages from this mailbox and
		   reindex. */
		if (!ctx->warned) {
			i_warning("lucene: Mailbox %s "
				  "missing UIDs in the middle",
				  mailbox_get_vname(ctx->box));
			ctx->warned = TRUE;
		}
	}
	/* otherwise: UID has been expunged from index. delete from
	   lucene as well. */
	return 0;
}
/* Reset the FTS index header of mailboxes that have no documents in the
   Lucene index.

   Iterates all mailboxes of index->list. When rescan_ctx is NULL every
   mailbox whose GUID can be read is reset; otherwise only those whose GUID
   is not in rescan_ctx->seen_mailbox_guids. The header written has only
   the current settings checksum set, all other fields are zero.
   Failures of individual mailboxes are ignored. */
static void rescan_clear_unseen_mailboxes(struct lucene_index *index,
					  struct rescan_context *rescan_ctx)
{
	const enum mailbox_list_iter_flags iter_flags =
		(enum mailbox_list_iter_flags)
		(MAILBOX_LIST_ITER_NO_AUTO_BOXES |
		 MAILBOX_LIST_ITER_RETURN_NO_FLAGS);
	struct mailbox_list_iterate_context *iter;
	const struct mailbox_info *info;
	struct mailbox *box;
	struct mailbox_metadata metadata;
	struct fts_index_header hdr;

	/* zeroed header apart from the settings checksum */
	memset(&hdr, 0, sizeof(hdr));
	hdr.settings_checksum = fts_lucene_settings_checksum(&index->set);

	iter = mailbox_list_iter_init(index->list, "*", iter_flags);
	while ((info = mailbox_list_iter_next(iter)) != NULL) {
		box = mailbox_alloc(index->list, info->vname,
				    (enum mailbox_flags)0);
		if (mailbox_get_metadata(box, MAILBOX_METADATA_GUID,
					 &metadata) == 0 &&
		    (rescan_ctx == NULL ||
		     hash_table_lookup(rescan_ctx->seen_mailbox_guids,
				       metadata.guid) == NULL)) {
			/* this mailbox had no records in lucene index.
			   make sure its last indexed uid is 0 */
			(void)fts_index_set_header(box, &hdr);
		}
		mailbox_free(&box);
	}
	(void)mailbox_list_iter_deinit(&iter);
}
/* Bring the Lucene index back in sync with the mailboxes.

   All documents are fetched sorted by mailbox GUID and UID, then compared
   in order against the UIDs existing in each mailbox. Documents for which
   rescan_open_mailbox() or rescan_next() return 0 are deleted from the
   index. Afterwards mailboxes without any documents get their FTS header
   reset.

   Requires index->list to be set. Returns 0 on success, -1 if the index
   couldn't be opened or any step failed. */
int lucene_index_rescan(struct lucene_index *index)
{
	static const TCHAR *sort_fields[] = { _T("box"), _T("uid"), NULL };
	struct rescan_context ctx;
	guid_128_t guid;
	bool failed = false;
	int ret;

	i_assert(index->list != NULL);

	if ((ret = lucene_index_open_search(index)) < 0)
		return ret;

	/* "box:*" matches every document that has a mailbox field */
	Term term(_T("box"), _T("*"));
	WildcardQuery query(&term);
	Sort sort(sort_fields);

	memset(&ctx, 0, sizeof(ctx));
	ctx.index = index;
	ctx.pool = pool_alloconly_create("guids", 1024);
	hash_table_create(&ctx.seen_mailbox_guids, ctx.pool, 0,
			  guid_128_hash, guid_128_cmp);
	i_array_init(&ctx.uids, 128);

	/* ret == 0 means there is no index: skip the search, but still
	   reset the mailboxes below */
	if (ret > 0) try {
		Hits *hits = index->searcher->search(&query, &sort);

		for (size_t i = 0; i < hits->length(); i++) {
			ret = rescan_open_mailbox(&ctx, &hits->doc(i));
			if (ret > 0)
				ret = rescan_next(&ctx, &hits->doc(i));
			if (ret < 0)
				failed = true;
			else if (ret == 0)
				index->reader->deleteDocument(hits->id(i));
		}
		_CLDELETE(hits);
		/* close the reader before the handles are released */
		index->reader->close();
		lucene_index_close(index);
	} catch (CLuceneError &err) {
		lucene_handle_error(index, err, "rescan search");
		failed = true;
	}
	/* finish the last mailbox that was being processed */
	if (ctx.box != NULL)
		rescan_finish(&ctx);
	array_free(&ctx.uids);

	rescan_clear_unseen_mailboxes(index, &ctx);
	hash_table_destroy(&ctx.seen_mailbox_guids);
	pool_unref(&ctx.pool);
	return failed ? -1 : 0;
}
static void guid128_to_wguid(const guid_128_t guid,
wchar_t wguid_hex[MAILBOX_GUID_HEX_LENGTH + 1])
{
buffer_t buf = { 0, 0, { 0, 0, 0, 0, 0 } };
unsigned char guid_hex[MAILBOX_GUID_HEX_LENGTH];
unsigned int i;
buffer_create_from_data(&buf, guid_hex, MAILBOX_GUID_HEX_LENGTH);
binary_to_hex_append(&buf, guid, GUID_128_SIZE);
for (i = 0; i < MAILBOX_GUID_HEX_LENGTH; i++)
wguid_hex[i] = guid_hex[i];
wguid_hex[i] = '\0';
}
/* Add one optional (SHOULD) "uid" term per UID in rec->uids to query.
   Returns false without touching query when the record has more than
   FTS_LUCENE_MAX_SEARCH_TERMS UIDs; the caller then has to go through all
   of the mailbox's documents instead. */
static bool
lucene_index_add_uid_filter(BooleanQuery *query,
			    const struct fts_expunge_log_read_record *rec)
{
	struct seq_range_iter iter;
	wchar_t wuid[MAX_INT_STRLEN];
	unsigned int n;
	uint32_t uid;

	/* RangeQuery and WildcardQuery work by enumerating through all terms
	   that match them, and then adding TermQueries for them. So we can
	   simply do the same directly, and if it looks like there are too
	   many terms just go through everything. */
	if (seq_range_count(&rec->uids) > FTS_LUCENE_MAX_SEARCH_TERMS)
		return false;

	seq_range_array_iter_init(&iter, &rec->uids);
	for (n = 0; seq_range_array_iter_nth(&iter, n, &uid); n++) {
		swprintf(wuid, N_ELEMENTS(wuid), L"%u", uid);

		Term *uid_term = _CLNEW Term(_T("uid"), wuid);
		query->add(_CLNEW TermQuery(uid_term), true, BooleanClause::SHOULD);
		/* drop our own reference to the term */
		_CLDECDELETE(uid_term);
	}
	return true;
}
/* Delete from the index the documents of one expunge log record, i.e. the
   listed UIDs of the record's mailbox.

   The search is restricted to the mailbox and, when the UID list is small
   enough, to the listed UIDs. Every hit is still checked against rec->uids
   before deletion; hits whose UID cannot be read are deleted as well.

   Returns 0 on success or when there is no index, -1 on error. */
static int
lucene_index_expunge_record(struct lucene_index *index,
			    const struct fts_expunge_log_read_record *rec)
{
	int ret;

	if ((ret = lucene_index_open_search(index)) <= 0)
		return ret;

	/* both queries live on the stack and are added by pointer */
	BooleanQuery query;
	BooleanQuery uids_query;

	if (lucene_index_add_uid_filter(&uids_query, rec))
		query.add(&uids_query, BooleanClause::MUST);

	/* always restrict the search to the record's mailbox */
	wchar_t wguid[MAILBOX_GUID_HEX_LENGTH + 1];
	guid128_to_wguid(rec->mailbox_guid, wguid);
	Term term(_T("box"), wguid);
	TermQuery mailbox_query(&term);
	query.add(&mailbox_query, BooleanClause::MUST);

	try {
		Hits *hits = index->searcher->search(&query);

		for (size_t i = 0; i < hits->length(); i++) {
			uint32_t uid;

			if (lucene_doc_get_uid(index, &hits->doc(i),
					       &uid) < 0 ||
			    seq_range_exists(&rec->uids, uid))
				index->reader->deleteDocument(hits->id(i));
		}
		_CLDELETE(hits);
	} catch (CLuceneError &err) {
		lucene_handle_error(index, err, "expunge search");
		ret = -1;
	}
	return ret < 0 ? -1 : 0;
}
/* Apply every record of the expunge log to the index, then close the
   reader and release the index handles. Processing stops at the first
   record that fails.
   Returns -1 if anything failed, otherwise the result of
   fts_expunge_log_read_end(). */
int lucene_index_expunge_from_log(struct lucene_index *index,
				  struct fts_expunge_log *log)
{
	struct fts_expunge_log_read_ctx *read_ctx;
	const struct fts_expunge_log_read_record *record;
	bool failed = false;

	read_ctx = fts_expunge_log_read_begin(log);
	for (;;) {
		record = fts_expunge_log_read_next(read_ctx);
		if (record == NULL)
			break;
		if (lucene_index_expunge_record(index, record) < 0) {
			failed = true;
			break;
		}
	}

	try {
		if (index->reader != NULL)
			index->reader->close();
		lucene_index_close(index);
	} catch (CLuceneError &err) {
		lucene_handle_error(index, err, "expunge delete");
		failed = true;
	}

	int end_ret = fts_expunge_log_read_end(&read_ctx);
	if (failed || end_ret < 0)
		return -1;
	return end_ret;
}
int lucene_index_optimize(struct lucene_index *index)
{
int ret = 0;
if (!IndexReader::indexExists(index->path))
return 0;
if (IndexReader::isLocked(index->path))
IndexReader::unlock(index->path);
IndexWriter *writer = NULL;
try {
writer = _CLNEW IndexWriter(index->path, index->default_analyzer, false);
writer->optimize();
} catch (CLuceneError &err) {
lucene_handle_error(index, err, "IndexWriter::optimize()");
ret = -1;
}
if (writer != NULL)
_CLDELETE(writer);
return ret;
}
// Mostly copy&pasted from CLucene's QueryParser
/* Build a query for queryText against _field using the given analyzer.

   The text is tokenized and, depending on the tokens produced:
     - no tokens: NULL is returned
     - one token: a TermQuery, or a FuzzyQuery when fuzzy is set
     - several tokens, all at distinct positions: a PhraseQuery
     - several tokens, some at the same position: a BooleanQuery of
       optional terms when they all share one position, otherwise a
       MultiPhraseQuery

   fuzzy only affects the single-token case. The returned query is
   allocated with _CLNEW. */
static Query* getFieldQuery(Analyzer *analyzer, const TCHAR* _field, const TCHAR* queryText, bool fuzzy) {
	// Use the analyzer to get all the tokens, and then build a TermQuery,
	// PhraseQuery, or nothing based on the term count

	StringReader reader(queryText);
	TokenStream* source = analyzer->tokenStream(_field, &reader);

	/* collected tokens; the vector's deletor is Deletor::Object<Token> */
	CLVector<CL_NS(analysis)::Token*, Deletor::Object<CL_NS(analysis)::Token> > v;
	CL_NS(analysis)::Token* t = NULL;
	int32_t positionCount = 0;
	bool severalTokensAtSamePosition = false;

	while (true) {
		t = _CLNEW Token();
		try {
			Token* _t = source->next(t);
			/* end of stream: drop the unused token, which also
			   makes t NULL and ends the loop below */
			if (_t == NULL) _CLDELETE(t);
		}_CLCATCH_ERR(CL_ERR_IO, _CLLDELETE(source);_CLLDELETE(t);,{
			t = NULL;
		});
		if (t == NULL)
			break;
		v.push_back(t);
		/* a zero position increment means this token shares its
		   position with the previous one */
		if (t->getPositionIncrement() != 0)
			positionCount += t->getPositionIncrement();
		else
			severalTokensAtSamePosition = true;
	}
	try {
		source->close();
	}
	_CLCATCH_ERR_CLEANUP(CL_ERR_IO, {_CLLDELETE(source);_CLLDELETE(t);} ); /* cleanup */
	_CLLDELETE(source);

	if (v.size() == 0)
		return NULL;
	else if (v.size() == 1) {
		Term* tm = _CLNEW Term(_field, v.at(0)->termBuffer());
		Query* ret;
		if (fuzzy)
			ret = _CLNEW FuzzyQuery( tm );
		else
			ret = _CLNEW TermQuery( tm );
		_CLDECDELETE(tm);
		return ret;
	} else {
		if (severalTokensAtSamePosition) {
			if (positionCount == 1) {
				// no phrase query:
				BooleanQuery* q = _CLNEW BooleanQuery(true);
				for(size_t i=0; i<v.size(); i++ ){
					Term* tm = _CLNEW Term(_field, v.at(i)->termBuffer());
					q->add(_CLNEW TermQuery(tm), true, BooleanClause::SHOULD);
					_CLDECDELETE(tm);
				}
				return q;
			}else {
				MultiPhraseQuery* mpq = _CLNEW MultiPhraseQuery();
				/* terms gathered for the current position */
				CLArrayList<Term*> multiTerms;
				int32_t position = -1;
				for (size_t i = 0; i < v.size(); i++) {
					t = v.at(i);
					/* a new position starts: emit the terms
					   gathered for the previous one */
					if (t->getPositionIncrement() > 0 && multiTerms.size() > 0) {
						ValueArray<Term*> termsArray(multiTerms.size());
						multiTerms.toArray(termsArray.values);
						mpq->add(&termsArray,position);
						multiTerms.clear();
					}
					position += t->getPositionIncrement();
					multiTerms.push_back(_CLNEW Term(_field, t->termBuffer()));
				}
				/* emit the terms of the last position */
				ValueArray<Term*> termsArray(multiTerms.size());
				multiTerms.toArray(termsArray.values);
				mpq->add(&termsArray,position);
				return mpq;
			}
		}else {
			PhraseQuery* pq = _CLNEW PhraseQuery();
			int32_t position = -1;

			for (size_t i = 0; i < v.size(); i++) {
				t = v.at(i);
				Term* tm = _CLNEW Term(_field, t->termBuffer());
				position += t->getPositionIncrement();
				pq->add(tm,position);
				_CLDECDELETE(tm);
			}
			return pq;
		}
	}
}
/* Build a query that matches the UTF-8 string str in field key. The
   analyzer is guessed from the text, with the index's default analyzer as
   fallback. Returns NULL when the text yields no tokens. */
static Query *
lucene_get_query_str(struct lucene_index *index,
		     const TCHAR *key, const char *str, bool fuzzy)
{
	const TCHAR *wide_text = t_lucene_utf8_to_tchar(index, str, TRUE);
	Analyzer *guessed = guess_analyzer(index, str, strlen(str));

	return getFieldQuery(guessed != NULL ? guessed :
			     index->default_analyzer,
			     key, wide_text, fuzzy);
}
/* Build a query for a search argument's string value in field key, using
   the argument's fuzzy flag. */
static Query *
lucene_get_query(struct lucene_index *index,
		 const TCHAR *key, const struct mail_search_arg *arg)
{
	const char *value = arg->value.str;

	return lucene_get_query_str(index, key, value, arg->fuzzy);
}
/* Try to turn a search argument into a query whose Lucene result can be
   used as the definite answer, and append it to queries.

   Handled are TEXT, BODY and header searches on headers accepted by
   fts_header_want_indexed() with a non-empty value. Negated arguments are
   only handled when and_args is set.

   The queued clause is SHOULD for OR-searches, otherwise MUST or MUST_NOT
   depending on arg->match_not.

   Returns true if a query was appended, false if the argument can't be
   handled here. */
static bool
lucene_add_definite_query(struct lucene_index *index,
			  ARRAY_TYPE(lucene_query) &queries,
			  struct mail_search_arg *arg, bool and_args)
{
	Query *q;

	if (arg->match_not && !and_args) {
		/* FIXME: we could handle this by doing multiple queries.. */
		return false;
	}

	switch (arg->type) {
	case SEARCH_TEXT: {
		/* TEXT matches either the headers or the body */
		Query *q1 = lucene_get_query(index, _T("hdr"), arg);
		Query *q2 = lucene_get_query(index, _T("body"), arg);

		if (q1 == NULL && q2 == NULL)
			q = NULL;
		else {
			/* allocated only when it is going to be used, so
			   nothing is left behind when both are NULL */
			BooleanQuery *bq = _CLNEW BooleanQuery();

			if (q1 != NULL)
				bq->add(q1, true, BooleanClause::SHOULD);
			if (q2 != NULL)
				bq->add(q2, true, BooleanClause::SHOULD);
			q = bq;
		}
		break;
	}
	case SEARCH_BODY:
		q = lucene_get_query(index, _T("body"), arg);
		break;
	case SEARCH_HEADER:
	case SEARCH_HEADER_ADDRESS:
	case SEARCH_HEADER_COMPRESS_LWSP:
		if (!fts_header_want_indexed(arg->hdr_field_name) ||
		    *arg->value.str == '\0')
			return false;

		/* the field is named after the lowercased header */
		q = lucene_get_query(index,
			t_lucene_utf8_to_tchar(index, t_str_lcase(arg->hdr_field_name), FALSE),
			arg);
		break;
	default:
		return false;
	}

	if (q == NULL) {
		/* couldn't handle this search after all (e.g. trying to search
		   a stop word) */
		return false;
	}

	struct lucene_query *lq = array_append_space(&queries);
	lq->query = q;
	if (!and_args)
		lq->occur = BooleanClause::SHOULD;
	else if (!arg->match_not)
		lq->occur = BooleanClause::MUST;
	else
		lq->occur = BooleanClause::MUST_NOT;
	return true;
}
/* Try to turn a header search argument into a query against the "hdr"
   field, whose result can only rule messages out, and append it to
   queries.

   With an empty search value the query looks for the header name itself.
   Otherwise it looks for the value, but only for headers not accepted by
   fts_header_want_indexed(). Negated arguments are never handled.

   The queued clause is SHOULD for OR-searches and MUST otherwise.

   Returns true if a query was appended, false if the argument can't be
   handled here. */
static bool
lucene_add_maybe_query(struct lucene_index *index,
		       ARRAY_TYPE(lucene_query) &queries,
		       struct mail_search_arg *arg, bool and_args)
{
	Query *q = NULL;

	if (arg->match_not) {
		/* FIXME: we could handle this by doing multiple queries.. */
		return false;
	}

	switch (arg->type) {
	case SEARCH_HEADER:
	case SEARCH_HEADER_ADDRESS:
	case SEARCH_HEADER_COMPRESS_LWSP:
		if (*arg->value.str == '\0') {
			/* checking potential existence of the header name */
			q = lucene_get_query_str(index, _T("hdr"),
				t_str_lcase(arg->hdr_field_name), FALSE);
			break;
		}

		if (fts_header_want_indexed(arg->hdr_field_name))
			return false;

		/* we can check if the search key exists in some header and
		   filter out the messages that have no chance of matching */
		q = lucene_get_query(index, _T("hdr"), arg);
		break;
	default:
		return false;
	}

	if (q == NULL) {
		/* couldn't handle this search after all (e.g. trying to search
		   a stop word) */
		return false;
	}

	struct lucene_query *lq = array_append_space(&queries);
	lq->query = q;
	/* match_not was rejected above, so MUST_NOT can't occur here */
	lq->occur = and_args ? BooleanClause::MUST : BooleanClause::SHOULD;
	return true;
}
/* Return TRUE if at least one queued query is something other than a
   MUST_NOT clause. */
static bool queries_have_non_must_nots(ARRAY_TYPE(lucene_query) &queries)
{
	const struct lucene_query *entry;

	array_foreach(&queries, entry) {
		if (entry->occur == BooleanClause::MUST_NOT)
			continue;
		return TRUE;
	}
	return FALSE;
}
/* Group the queued queries into one BooleanQuery and add it to query.
   When the queue has at least one non-MUST_NOT clause, every query keeps
   its own clause type and the group is added as MUST. When it has only
   MUST_NOT clauses, they are added to the group as SHOULD and the group
   itself is added as MUST_NOT. */
static void search_query_add(BooleanQuery &query,
			     ARRAY_TYPE(lucene_query) &queries)
{
	BooleanQuery *group = _CLNEW BooleanQuery();
	const bool has_positive = queries_have_non_must_nots(queries);
	const struct lucene_query *entry;

	array_foreach(&queries, entry) {
		group->add(entry->query, true,
			   has_positive ? entry->occur :
			   BooleanClause::SHOULD);
	}
	query.add(group, true,
		  has_positive ? BooleanClause::MUST :
		  BooleanClause::MUST_NOT);
}
/* Run `queries` against the index, restricted to documents whose "box" field
   equals index->mailbox_guid.

   The UID of every hit is added to `uids_r`. If `result` is non-NULL, a
   (uid, score) entry is also appended to result->scores for each hit, and
   result->scores_sorted is cleared if a hit's UID is smaller than the
   previous hit's UID.

   Returns 0 on success, -1 if a hit's UID couldn't be read or CLucene threw
   an error. Hits processed before the failure stay in the output arrays. */
static int
lucene_index_search(struct lucene_index *index,
		    ARRAY_TYPE(lucene_query) &queries,
		    struct fts_result *result, ARRAY_TYPE(seq_range) *uids_r)
{
	struct fts_score_map *score;
	/* declared outside the try block so that it can be released also when
	   an exception is thrown while the hits are being processed */
	Hits *hits = NULL;
	int ret = 0;

	BooleanQuery query;
	search_query_add(query, queries);

	Term mailbox_term(_T("box"), index->mailbox_guid);
	TermQuery mailbox_query(&mailbox_term);
	query.add(&mailbox_query, BooleanClause::MUST);

	try {
		hits = index->searcher->search(&query);

		uint32_t last_uid = 0;
		if (result != NULL)
			result->scores_sorted = true;

		for (size_t i = 0; i < hits->length(); i++) {
			uint32_t uid;

			if (lucene_doc_get_uid(index, &hits->doc(i),
					       &uid) < 0) {
				ret = -1;
				break;
			}

			if (result != NULL) {
				if (uid < last_uid)
					result->scores_sorted = false;
				last_uid = uid;

				score = array_append_space(&result->scores);
				score->uid = uid;
				score->score = hits->score(i);
			}
			seq_range_array_add(uids_r, uid);
		}
	} catch (CLuceneError &err) {
		lucene_handle_error(index, err, "search");
		ret = -1;
	}
	if (hits != NULL)
		_CLDELETE(hits);
	return ret;
}
/* Look up the search arguments from the index of the current mailbox.

   Two passes are made over `args`. The first collects the arguments that
   lucene_add_definite_query() accepts and stores the matching UIDs (and
   scores) into result->definite_uids. The second collects the arguments that
   lucene_add_maybe_query() accepts and stores the matching UIDs into
   result->maybe_uids. Every accepted argument gets match_always set.

   Returns 0 on success, -1 if the index couldn't be opened for searching or
   a search failed. */
int lucene_index_lookup(struct lucene_index *index,
			struct mail_search_arg *args, bool and_args,
			struct fts_result *result)
{
	struct mail_search_arg *arg;

	if (lucene_index_open_search(index) <= 0)
		return -1;

	/* pass 1: definite matches */
	ARRAY_TYPE(lucene_query) def_queries;
	t_array_init(&def_queries, 16);
	bool got_definite = false;
	for (arg = args; arg != NULL; arg = arg->next) {
		if (!lucene_add_definite_query(index, def_queries, arg, and_args))
			continue;
		arg->match_always = true;
		got_definite = true;
	}
	if (got_definite &&
	    lucene_index_search(index, def_queries, result,
				&result->definite_uids) < 0)
		return -1;

	/* pass 2: possible matches */
	ARRAY_TYPE(lucene_query) maybe_queries;
	t_array_init(&maybe_queries, 16);
	bool got_maybe = false;
	for (arg = args; arg != NULL; arg = arg->next) {
		if (!lucene_add_maybe_query(index, maybe_queries, arg, and_args))
			continue;
		arg->match_always = true;
		got_maybe = true;
	}
	if (got_maybe &&
	    lucene_index_search(index, maybe_queries, NULL,
				&result->maybe_uids) < 0)
		return -1;

	return 0;
}
static int
lucene_index_search_multi(struct lucene_index *index,
HASH_TABLE_TYPE(wguid_result) guids,
ARRAY_TYPE(lucene_query) &queries,
struct fts_multi_result *result)
{
struct fts_score_map *score;
int ret = 0;
BooleanQuery query;
search_query_add(query, queries);
BooleanQuery mailbox_query;
struct hash_iterate_context *iter;
wchar_t *key;
struct fts_result *value;
iter = hash_table_iterate_init(guids);
while (hash_table_iterate(iter, guids, &key, &value)) {
Term *term = _CLNEW Term(_T("box"), key);
TermQuery *q = _CLNEW TermQuery(term);
mailbox_query.add(q, true, BooleanClause::SHOULD);
}
hash_table_iterate_deinit(&iter);
query.add(&mailbox_query, BooleanClause::MUST);
try {
Hits *hits = index->searcher->search(&query);
for (size_t i = 0; i < hits->length(); i++) {
uint32_t uid;
Field *field = hits->doc(i).getField(_T("box"));
const TCHAR *box_guid = field == NULL ? NULL : field->stringValue();
if (box_guid == NULL) {
i_error("lucene: Corrupted FTS index %s: No mailbox for document",
index->path);
ret = -1;
break;
}
struct fts_result *br =
hash_table_lookup(guids, box_guid);
if (br == NULL) {
i_warning("lucene: Returned unexpected mailbox with GUID %ls", box_guid);
continue;
}
if (lucene_doc_get_uid(index, &hits->doc(i),
&uid) < 0) {
ret = -1;
break;
}
if (!array_is_created(&br->definite_uids)) {
p_array_init(&br->definite_uids, result->pool, 32);
p_array_init(&br->scores, result->pool, 32);
}
seq_range_array_add(&br->definite_uids, uid);
score = array_append_space(&br->scores);
score->uid = uid;
score->score = hits->score(i);
}
_CLDELETE(hits);
return ret;
} catch (CLuceneError &err) {
lucene_handle_error(index, err, "multi search");
return -1;
}
}
/* Look up the search arguments from several mailboxes at once.

   Only the arguments accepted by lucene_add_definite_query() are searched;
   each of them gets match_always set. If there is at least one, the search
   is run over the mailboxes listed in `guids`.

   Returns 0 on success (also when nothing was searched), -1 if the index
   couldn't be opened for searching or the search failed. */
int lucene_index_lookup_multi(struct lucene_index *index,
			      HASH_TABLE_TYPE(wguid_result) guids,
			      struct mail_search_arg *args, bool and_args,
			      struct fts_multi_result *result)
{
	struct mail_search_arg *arg;

	if (lucene_index_open_search(index) <= 0)
		return -1;

	ARRAY_TYPE(lucene_query) def_queries;
	t_array_init(&def_queries, 16);
	bool got_definite = false;
	for (arg = args; arg != NULL; arg = arg->next) {
		if (!lucene_add_definite_query(index, def_queries, arg, and_args))
			continue;
		arg->match_always = true;
		got_definite = true;
	}

	if (!got_definite)
		return 0;
	if (lucene_index_search_multi(index, guids, def_queries, result) < 0)
		return -1;
	return 0;
}
/* State of an iteration over the documents in the index. Created by
   lucene_index_iter_init(), advanced by lucene_index_iter_next() and freed
   by lucene_index_iter_deinit(). */
struct lucene_index_iter {
/* index being iterated */
struct lucene_index *index;
/* record returned by lucene_index_iter_next(); overwritten on each call */
struct lucene_index_record rec;
/* term, query and sort used for the search; they are left NULL if
   lucene_index_iter_init() couldn't open the index for searching */
Term *term;
WildcardQuery *query;
Sort *sort;
/* search results; NULL if the search wasn't run or threw an error */
Hits *hits;
/* index of the next hit to return */
size_t i;
/* set if opening the index or searching failed; makes
   lucene_index_iter_deinit() return -1 */
bool failed;
};
/* Start iterating over the documents in the index using a wildcard query on
   the "box" field, sorted by the "box" and "uid" fields.

   An iterator is always returned. If the index can't be opened for searching
   or the search throws an error, the iterator simply has no hits; errors are
   remembered in iter->failed and reported by lucene_index_iter_deinit(). */
struct lucene_index_iter *
lucene_index_iter_init(struct lucene_index *index)
{
	static const TCHAR *sort_fields[] = { _T("box"), _T("uid"), NULL };
	struct lucene_index_iter *iter = i_new(struct lucene_index_iter, 1);

	iter->index = index;

	int ret = lucene_index_open_search(index);
	if (ret < 0) {
		iter->failed = true;
		return iter;
	}
	if (ret == 0) {
		/* nothing to iterate, but not an error either */
		return iter;
	}

	iter->term = _CLNEW Term(_T("box"), _T("*"));
	iter->query = _CLNEW WildcardQuery(iter->term);
	iter->sort = _CLNEW Sort(sort_fields);

	try {
		iter->hits = index->searcher->search(iter->query, iter->sort);
	} catch (CLuceneError &err) {
		lucene_handle_error(index, err, "rescan search");
		iter->failed = true;
	}
	return iter;
}
const struct lucene_index_record *
lucene_index_iter_next(struct lucene_index_iter *iter)
{
if (iter->hits == NULL)
return NULL;
if (iter->i == iter->hits->length())
return NULL;
Document *doc = &iter->hits->doc(iter->i);
iter->i++;
memset(&iter->rec, 0, sizeof(iter->rec));
(void)fts_lucene_get_mailbox_guid(iter->index, doc,
iter->rec.mailbox_guid);
(void)lucene_doc_get_uid(iter->index, doc, &iter->rec.uid);
return &iter->rec;
}
/* Free the iterator and everything it allocated, and set *_iter to NULL.

   Returns -1 if the iteration had failed (iter->failed was set), 0
   otherwise. */
int lucene_index_iter_deinit(struct lucene_index_iter **_iter)
{
	struct lucene_index_iter *iter = *_iter;
	int ret;

	*_iter = NULL;
	ret = iter->failed ? -1 : 0;

	if (iter->hits != NULL) {
		_CLDELETE(iter->hits);
	}
	/* the query, sort and term are always created together, so the
	   query alone tells whether any of them exist */
	if (iter->query != NULL) {
		_CLDELETE(iter->query);
		_CLDELETE(iter->sort);
		_CLDELETE(iter->term);
	}
	i_free(iter);
	return ret;
}
/* Wrapper that calls _lucene_shutdown().
   NOTE(review): presumably this releases CLucene's global state and should
   be called once when the plugin is unloaded - confirm against the CLucene
   documentation. */
void lucene_shutdown(void)
{
_lucene_shutdown();
}