lucene-wrapper.cc revision acc72c40c5bfe818013e0ae9c9e73eb90ae8fbb1
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen/* Copyright (c) 2006-2010 Dovecot authors, see the included COPYING file */
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen/* Lucene's default is 10000. Use it here also.. */
72cbf33ae81fde08384d30c779ff540752d9256cTimo SirainenARRAY_DEFINE_TYPE(lucene_query, struct lucene_query);
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen wchar_t mailbox_guid[MAILBOX_GUID_HEX_LENGTH + 1];
50c4a9739b55370b1d3950d7b3ec2f7cd2ed5f49Timo Sirainen HASH_TABLE(uint8_t *, uint8_t *) seen_mailbox_guids;
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainenstatic void rescan_clear_unseen_mailboxes(struct lucene_index *index,
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainenstruct lucene_index *lucene_index_init(const char *path,
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen unsigned int len;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen mailbox_list_get_namespace(list)->user->default_normalizer;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen /* this is valid only for doveadm dump, so it doesn't matter */
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen _CLNEW snowball::SnowballAnalyzer(index->normalizer,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->default_analyzer = _CLNEW standard::StandardAnalyzer();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenvoid lucene_index_close(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenvoid lucene_index_deinit(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen array_foreach_modifiable(&index->analyzers, a) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (--textcat_refcount == 0 && textcat != NULL) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic void lucene_data_translate(struct lucene_index *index,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const char *whitespace_chars = index->set.whitespace_chars;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen unsigned int i;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen for (i = 0; i < len; i++) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (strchr(whitespace_chars, data[i]) != NULL)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenvoid lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_assert(sizeof(wchar_t) == sizeof(unichar_t));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen buffer_create_from_data(&buf, dest, sizeof(wchar_t) * destsize);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen array_create_from_buffer(&dest_arr, &buf, sizeof(wchar_t));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (uni_utf8_to_ucs4_n(src, srcsize, &dest_arr) < 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_assert(array_count(&dest_arr)+1 == destsize);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic const wchar_t *
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainent_lucene_utf8_to_tchar(struct lucene_index *index,
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen unsigned int len;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_assert(sizeof(wchar_t) == sizeof(unichar_t));
0a51697f82fbd45a511710479e99efd42dc18453Timo Sirainen chars = array_get_modifiable(&dest_arr, &len);
0a51697f82fbd45a511710479e99efd42dc18453Timo Sirainenvoid lucene_index_select_mailbox(struct lucene_index *index,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen MAILBOX_GUID_HEX_LENGTH * sizeof(wchar_t));
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->mailbox_guid[MAILBOX_GUID_HEX_LENGTH] = '\0';
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenvoid lucene_index_unselect_mailbox(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen memset(index->mailbox_guid, 0, sizeof(index->mailbox_guid));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic void lucene_handle_error(struct lucene_index *index, CLuceneError &err,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const char *msg)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_error("lucene index %s: %s failed (#%d): %s",
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen /* delete corrupted index. most IO errors are also about
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen missing files and other such corruption.. */
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_error("unlink_directory(%s) failed: %m", index->path);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenstatic int lucene_index_open(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->reader = IndexReader::open(index->path);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen lucene_handle_error(index, err, "IndexReader::open()");
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic int lucene_index_open_search(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->searcher = _CLNEW IndexSearcher(index->reader);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenlucene_doc_get_uid(struct lucene_index *index, Document *doc, uint32_t *uid_r)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen const TCHAR *uid = field == NULL ? NULL : field->stringValue();
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen i_error("lucene: Corrupted FTS index %s: No UID for document",
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen while (*uid != 0) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenlucene_doc_get_part(struct lucene_index *index, Document *doc)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen const TCHAR *part = field == NULL ? NULL : field->stringValue();
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen while (*part != 0) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenint lucene_index_get_last_uid(struct lucene_index *index, uint32_t *last_uid_r)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if ((ret = lucene_index_open_search(index)) <= 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen Term mailbox_term(_T("box"), index->mailbox_guid);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen lucene_handle_error(index, err, "last_uid search");
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainenint lucene_index_get_doc_count(struct lucene_index *index, uint32_t *count_r)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic int lucene_settings_check(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen set_checksum = fts_lucene_settings_checksum(&index->set);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen ret = fts_index_have_compatible_settings(index->list, set_checksum);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen /* settings changed, rebuild index */
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (unlink_directory(index->path, UNLINK_DIRECTORY_FLAG_RMDIR) < 0) {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen i_error("unlink_directory(%s) failed: %m", index->path);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenint lucene_index_build_init(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen lock_path = t_strdup_printf("%s/write.lock", index->path);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen st.st_mtime < time(NULL) - LUCENE_LOCK_OVERRIDE_SECS) {
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen bool exists = IndexReader::indexExists(index->path);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen index->writer = _CLNEW IndexWriter(index->path,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen lucene_handle_error(index, err, "IndexWriter()");
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen index->writer->setMaxFieldLength(MAX_TERMS_PER_DOCUMENT);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic Analyzer *get_analyzer(struct lucene_index *index, const char *lang)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen normalizer_func_t *normalizer = index->normalizer;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const struct lucene_analyzer *a;
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen memset(&new_analyzer, 0, sizeof(new_analyzer));
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen _CLNEW snowball::SnowballAnalyzer(normalizer, lang);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen array_append_i(&index->analyzers.arr, &new_analyzer, 1);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic void *textcat_init(struct lucene_index *index)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const char *textcat_dir = index->set.textcat_dir;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen unsigned int len;
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen /* textcat really wants the '/' suffix */
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen textcat_dir = t_strconcat(textcat_dir, "/", NULL);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen return special_textcat_Init(index->set.textcat_conf, textcat_dir);
50c4a9739b55370b1d3950d7b3ec2f7cd2ed5f49Timo Sirainenguess_analyzer(struct lucene_index *index, const void *data, size_t size)
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen /* try to guess the language */
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen lang = textcat_Classify(textcat, (const char *)data,
a2ce2eb4c266e2854fd34416ea5cfbe05dfd3971Timo Sirainen if (strcmp(lang, index->set.default_language) == 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenguess_analyzer(struct lucene_index *index ATTR_UNUSED,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const void *data ATTR_UNUSED, size_t size ATTR_UNUSED)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainenstatic int lucene_index_build_flush(struct lucene_index *index)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen lucene_handle_error(index, err, "IndexWriter::addDocument()");
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainenint lucene_index_build_more(struct lucene_index *index, uint32_t uid,
31854ec69857e384882bcade5cf0c5dea8abf230Timo Sirainen if (uid != index->prev_uid || part_idx != index->prev_part_idx) {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->doc->add(*_CLNEW Field(_T("uid"), id, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen swprintf(id, N_ELEMENTS(id), L"%u", part_idx);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->doc->add(*_CLNEW Field(_T("part"), id, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen buffer_set_used_size(index->normalizer_buf, 0);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen index->normalizer(data, size, index->normalizer_buf);
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen data = (const unsigned char *)index->normalizer_buf->data;
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen lucene_utf8_n_to_tchar(data, size, dest, datasize);
5ebddd2d812296900bc255b24bcd508878784c37Timo Sirainen /* hdr_name should be ASCII, but don't break in case it isn't */
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen lucene_utf8_n_to_tchar((const unsigned char *)hdr_name,
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->doc->add(*_CLNEW Field(_T("hdr"), wname, Field::STORE_NO | Field::INDEX_TOKENIZED));
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->doc->add(*_CLNEW Field(_T("hdr"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->doc->add(*_CLNEW Field(wname, dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen } else if (size > 0) {
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->cur_analyzer = guess_analyzer(index, data, size);
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen index->doc->add(*_CLNEW Field(_T("body"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainenint lucene_index_build_deinit(struct lucene_index *index)
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen /* no changes. */
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen lucene_handle_error(index, err, "IndexWriter::close()");
4bbee99b3aef449a9a2a11a5b5cf1ca486915c49Timo Sirainenwcharguid_to_guid(guid_128_t dest, const wchar_t *src)
e015e2f7e7f48874495f9df8b0dd192b7ffcb5ccTimo Sirainen unsigned int i;
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainen buffer_create_from_data(&buf, dest, GUID_128_SIZE);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenrescan_get_uids(struct mailbox *box, ARRAY_TYPE(seq_range) *uids)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (mailbox_get_status(box, STATUS_MESSAGES, &status) < 0)
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen seq_range_array_add_range(&seqs, 1, status.messages);
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainenstatic int rescan_finish(struct rescan_context *ctx)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen ret = fts_index_set_last_uid(ctx->box, ctx->last_existing_uid);
992a13add4eea0810e4db0f042a595dddf85536aTimo Sirainenfts_lucene_get_mailbox_guid(struct lucene_index *index, Document *doc,
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen const TCHAR *box_guid = field == NULL ? NULL : field->stringValue();
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen i_error("lucene: Corrupted FTS index %s: No mailbox for document",
ca316aeb7648d3f1bcf45231f73ddeb1b67a6961Timo Sirainen if (wcharguid_to_guid(guid_r, box_guid) < 0) {
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen "box field not in expected format", index->path);
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainenrescan_open_mailbox(struct rescan_context *ctx, Document *doc)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (fts_lucene_get_mailbox_guid(ctx->index, doc, guid) < 0)
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen if (memcmp(guid, ctx->box_guid, sizeof(guid)) == 0) {
531fa12126fc7abf63244a7ed4505896a8694206Timo Sirainen /* same as last one */
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen memcpy(ctx->box_guid, guid, sizeof(ctx->box_guid));
72cbf33ae81fde08384d30c779ff540752d9256cTimo Sirainen hash_table_insert(ctx->seen_mailbox_guids, guidp, guidp);
e1f05b193ac1edd3267294e9501e8063aa0f791aTimo Sirainen ctx->box = mailbox_alloc_guid(ctx->index->list, guid,
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen errstr = mailbox_get_last_error(ctx->box, &error);
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen i_error("lucene: Couldn't open mailbox %s: %s",
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen if (mailbox_sync(ctx->box, (enum mailbox_sync_flags)0) < 0) {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen i_error("lucene: Failed to sync mailbox %s: %s",
&idx_uid)) {
(enum mailbox_list_iter_flags)
(enum mailbox_flags)0);
&metadata) == 0 &&
bool failed = false;
int ret;
return ret;
if (ret > 0) try {
if (ret > 0)
if (ret < 0)
failed = true;
else if (ret == 0)
failed = true;
for (i = 0; i < MAILBOX_GUID_HEX_LENGTH; i++)
int ret;
return ret;
&uid) < 0 ||
return ret2;
int ret = 0;
return ret;
static Query* getFieldQuery(Analyzer *analyzer, const TCHAR* _field, const TCHAR* queryText, bool fuzzy) {
bool severalTokensAtSamePosition = false;
t = NULL;
if (t == NULL)
v.push_back(t);
if (t->getPositionIncrement() != 0)
severalTokensAtSamePosition = true;
if (v.size() == 0)
return NULL;
if (fuzzy)
return ret;
if (severalTokensAtSamePosition) {
t = v.at(i);
return mpq;
t = v.at(i);
return pq;
static Query *
static Query *
Query *q;
case SEARCH_TEXT: {
q = NULL;
q = bq;
case SEARCH_BODY:
case SEARCH_HEADER:
case SEARCH_HEADER_ADDRESS:
arg);
if (q == NULL) {
if (!and_args)
case SEARCH_HEADER:
case SEARCH_HEADER_ADDRESS:
if (q == NULL) {
if (!and_args)
return TRUE;
return FALSE;
int ret = 0;
&uid) < 0) {
return ret;
bool have_definites = false;
have_definites = true;
if (have_definites) {
if (have_definites) {
bool have_maybies = false;
have_maybies = true;
if (have_maybies) {
int ret = 0;
&uid) < 0) {
return ret;
bool have_definites = false;
have_definites = true;
if (have_definites) {
struct lucene_index_iter {
size_t i;
bool failed;
struct lucene_index_iter *
int ret;
if (ret < 0)
return iter;
return iter;
const struct lucene_index_record *
return NULL;
return NULL;
iter->i++;
return ret;
void lucene_shutdown(void)