lucene-wrapper.cc revision 06ed0c58ea392df22ccf4868aac494831ea756e1
/* Copyright (c) 2006-2010 Dovecot authors, see the included COPYING file */
extern "C" {
#include "lib.h"
#include "array.h"
#include "unichar.h"
#include "hash.h"
#include "hex-binary.h"
#include "unlink-directory.h"
#include "mail-index.h"
#include "mail-search.h"
#include "mail-namespace.h"
#include "mail-storage.h"
#include "fts-expunge-log.h"
#include "fts-lucene-plugin.h"
#include "lucene-wrapper.h"
#ifdef HAVE_LUCENE_TEXTCAT
# include <libtextcat/textcat.h>
#endif
};
#include <CLucene.h>
#include "SnowballAnalyzer.h"
/* Maximum number of terms per document. Lucene's own default is 10000,
   so the same value is used here. */
#define MAX_TERMS_PER_DOCUMENT 10000
/* NOTE(review): by its name, an upper bound on the number of terms used in
   a single search; confirm against the code that references it. */
#define FTS_LUCENE_MAX_SEARCH_TERMS 1000
/* NOTE(review): by its name, a lock age in seconds after which the lock may
   be overridden; confirm against the code that references it. */
#define LUCENE_LOCK_OVERRIDE_SECS 60
using namespace lucene::queryParser;
/* One analyzer entry.
   NOTE(review): code elsewhere in this file reads an `analyzer` member
   through a `struct lucene_analyzer *`, but no such member is declared
   here - confirm the definition is complete. */
struct lucene_analyzer {
/* presumably the language name this entry belongs to - TODO confirm;
   who allocates and frees the string is not shown here */
char *lang;
};
/* State of one Lucene index.
   NOTE(review): code elsewhere in this file accesses `doc`, `mailbox_guid`
   and `default_analyzer` through a `struct lucene_index *`, but those
   members are not declared here - confirm the definition is complete. */
struct lucene_index {
/* presumably the filesystem path of the index; it is what the error
   messages in this file print as "%s" for the index - TODO confirm */
char *path;
/* mailbox list associated with this index */
struct mailbox_list *list;
/* fts-lucene settings, stored by value (not a pointer to the caller's
   settings) */
struct fts_lucene_settings set;
};
/* Working state carried through an index rescan.
   NOTE(review): this file also assigns `ctx->last_existing_uid` right next
   to `ctx->uids_iter_n`; if that `ctx` is a rescan_context, the member is
   missing from this definition - confirm. */
struct rescan_context {
/* index being rescanned */
struct lucene_index *index;
/* NOTE(review): looks like the cached result of the last mailbox open
   attempt - confirm against the code that sets it */
int box_ret;
/* hash table of GUIDs; key/value types are not visible here */
struct hash_table *guids;
/* iterator over a UID sequence range, with its position in uids_iter_n */
struct seq_range_iter uids_iter;
/* reset to 0 and incremented as UIDs are matched */
unsigned int uids_iter_n;
/* NOTE(review): presumably set once a warning has been logged so it is
   not repeated - confirm */
bool warned;
};
/* When TRUE, the textcat-based analyzer lookup in this file returns NULL
   immediately instead of trying to guess the language. */
static bool textcat_broken = FALSE;
/* NOTE(review): by its name, a count of users of the shared textcat state;
   confirm against the init/deinit code that updates it. */
static int textcat_refcount = 0;
struct hash_table *guids);
struct mailbox_list *list,
const struct fts_lucene_settings *set)
{
struct lucene_index *index;
unsigned int len;
else {
/* this is valid only for doveadm dump, so it doesn't matter */
}
#ifdef HAVE_LUCENE_STEMMER
#else
#endif
return index;
}
{
}
{
struct lucene_analyzer *a;
}
#ifdef HAVE_LUCENE_TEXTCAT
#endif
}
}
{
i_unreached();
}
static const wchar_t *t_lucene_utf8_to_tchar(const char *str)
{
i_unreached();
(void)array_append_space(&dest_arr);
return (const wchar_t *)ret;
}
const wchar_t guid[MAILBOX_GUID_HEX_LENGTH])
{
MAILBOX_GUID_HEX_LENGTH * sizeof(wchar_t));
}
{
}
const char *msg)
{
i_error("lucene index %s: %s failed (#%d): %s",
/* Delete the corrupted index. Most IO errors are also about
missing files and other such corruption. */
}
}
{
return 1;
return 0;
try {
} catch (CLuceneError &err) {
return -1;
}
return 1;
}
{
int ret;
return 1;
return ret;
return 1;
}
static int
{
i_error("lucene: Corrupted FTS index %s: No UID for document",
return -1;
}
while (*uid != 0) {
uid++;
}
return 0;
}
{
int ret = 0;
*last_uid_r = 0;
return ret;
try {
&uid) < 0) {
ret = -1;
break;
}
}
} catch (CLuceneError &err) {
ret = -1;
}
*last_uid_r = last_uid;
return ret;
}
{
int ret;
return -1;
if (ret == 0) {
*count_r = 0;
return 0;
}
}
return 0;
}
{
struct fts_index_header hdr;
int ret = 0;
if (ret != 0)
return ret;
/* settings changed, rebuild index */
ret = -1;
} else {
}
return ret;
}
{
const char *lock_path;
}
if (lucene_settings_check(index) < 0)
return -1;
try {
!exists);
} catch (CLuceneError &err) {
return -1;
}
return 0;
}
#ifdef HAVE_LUCENE_TEXTCAT
{
const struct lucene_analyzer *a;
struct lucene_analyzer new_analyzer;
return a->analyzer;
}
return new_analyzer.analyzer;
}
{
unsigned int len;
if (textcat_dir == NULL)
return NULL;
/* textcat really wants the '/' suffix */
}
static Analyzer *
{
const char *lang;
if (textcat_broken)
return NULL;
return NULL;
}
}
/* try to guess the language */
return NULL;
return index->default_analyzer;
}
#else
static Analyzer *
{
return NULL;
}
#endif
{
int ret = 0;
return 0;
try {
} catch (CLuceneError &err) {
ret = -1;
}
return ret;
}
const char *hdr_name)
{
wchar_t id[MAX_INT_STRLEN];
if (lucene_index_build_flush(index) < 0)
return -1;
index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
}
/* hdr_name should be ASCII, but don't break in case it isn't */
lucene_utf8_n_to_tchar((const unsigned char *)hdr_name,
} else if (size > 0) {
}
return 0;
}
{
int ret = 0;
/* no changes. */
return 0;
}
return -1;
}
if (lucene_index_build_flush(index) < 0)
ret = -1;
try {
} catch (CLuceneError &err) {
ret = -1;
}
return ret;
}
static int
{
unsigned int i;
for (i = 0; i < sizeof(src_chars)-1; i++) {
else
return -1;
}
if (src[i] != '\0')
return -1;
src_chars[i] = '\0';
}
static int
{
struct mailbox_status status;
return -1;
} T_END;
return 0;
}
{
int ret;
return ret;
}
static int
{
i_error("lucene: Corrupted FTS index %s: No mailbox for document",
return -1;
}
i_error("lucene: Corrupted FTS index %s: "
return -1;
}
return 0;
}
static int
{
int ret;
return 0;
/* same as last one */
}
(enum mailbox_flags)0);
enum mail_error error;
const char *errstr;
if (error == MAIL_ERROR_NOTFOUND)
ret = 0;
else {
i_error("lucene: Couldn't open mailbox %s: %s",
ret = -1;
}
return ret;
}
i_error("lucene: Failed to sync mailbox %s: %s",
return -1;
}
ctx->last_existing_uid = 0;
ctx->uids_iter_n = 0;
return 1;
}
static int
{
return 0;
&idx_uid)) {
if (idx_uid == lucene_uid) {
ctx->uids_iter_n++;
return 1;
}
if (idx_uid < lucene_uid) {
/* Lucene is missing a UID from the middle. Delete
the rest of the messages from this mailbox and
reindex. */
i_warning("lucene: Mailbox %s "
"missing UIDs in the middle",
}
} else {
/* UID has been expunged from index. delete from
lucene as well. */
}
return 0;
} else {
/* the rest of the messages have been expunged from index */
return 0;
}
}
struct hash_table *guids)
{
const enum mailbox_list_iter_flags iter_flags =
(enum mailbox_list_iter_flags)
struct mailbox_list_iterate_context *iter;
const struct mailbox_info *info;
struct mailbox_metadata metadata;
struct fts_index_header hdr;
(enum mailbox_flags)0);
&metadata) == 0 &&
/* this mailbox had no records in lucene index.
make sure its last indexed uid is 0 */
}
mailbox_free(&box);
}
(void)mailbox_list_iter_deinit(&iter);
}
{
struct rescan_context ctx;
bool failed = false;
int ret;
return ret;
if (ret > 0) try {
if (ret > 0)
if (ret < 0)
failed = true;
else if (ret == 0)
}
} catch (CLuceneError &err) {
failed = true;
}
rescan_finish(&ctx);
return failed ? -1 : 0;
}
{
unsigned char guid_hex[MAILBOX_GUID_HEX_LENGTH];
unsigned int i;
for (i = 0; i < MAILBOX_GUID_HEX_LENGTH; i++)
wguid_hex[i] = '\0';
}
static bool
const struct fts_expunge_log_read_record *rec)
{
struct seq_range_iter iter;
wchar_t wuid[MAX_INT_STRLEN];
unsigned int n;
/* RangeQuery and WildcardQuery work by enumerating through all terms
that match them, and then adding TermQueries for them. So we can
simply do the same directly, and if it looks like there are too
many terms just go through everything. */
return false;
}
return true;
}
static int
const struct fts_expunge_log_read_record *rec)
{
int ret;
return ret;
try {
&uid) < 0 ||
}
} catch (CLuceneError &err) {
ret = -1;
}
return ret < 0 ? -1 : 0;
}
struct fts_expunge_log *log)
{
struct fts_expunge_log_read_ctx *ctx;
const struct fts_expunge_log_read_record *rec;
ret = -1;
break;
}
}
try {
} catch (CLuceneError &err) {
ret = -1;
}
return -1;
return ret2;
}
{
int ret = 0;
return 0;
try {
} catch (CLuceneError &err) {
ret = -1;
}
return ret;
}
// Mostly copy&pasted from CLucene's QueryParser
static Query* getFieldQuery(Analyzer *analyzer, const TCHAR* _field, const TCHAR* queryText, bool fuzzy) {
// Use the analyzer to get all the tokens, and then build a TermQuery,
// PhraseQuery, or nothing based on the term count
int32_t positionCount = 0;
bool severalTokensAtSamePosition = false;
while (true) {
try {
t = NULL;
});
if (t == NULL)
break;
v.push_back(t);
if (t->getPositionIncrement() != 0)
positionCount += t->getPositionIncrement();
else
severalTokensAtSamePosition = true;
}
try {
}
if (v.size() == 0)
return NULL;
else if (v.size() == 1) {
if (fuzzy)
else
return ret;
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
// no phrase query:
}
return q;
}else {
t = v.at(i);
multiTerms.clear();
}
position += t->getPositionIncrement();
}
return mpq;
}
}else {
t = v.at(i);
position += t->getPositionIncrement();
}
return pq;
}
}
}
static Query *
{
}
static Query *
{
}
static bool
{
Query *q;
/* FIXME: we could handle this by doing multiple queries.. */
return false;
}
case SEARCH_TEXT: {
q = NULL;
else {
q = bq;
}
break;
}
case SEARCH_BODY:
break;
case SEARCH_HEADER:
case SEARCH_HEADER_ADDRESS:
return false;
/* FIXME: handle existence of a search key */
return false;
}
q = lucene_get_query(index,
arg);
break;
default:
return false;
}
if (q == NULL) {
/* couldn't handle this search after all (e.g. trying to search
a stop word) */
return false;
}
if (!and_args)
else
return true;
}
static bool
{
Query *q;
/* FIXME: we could handle this by doing multiple queries.. */
return false;
}
case SEARCH_HEADER:
case SEARCH_HEADER_ADDRESS:
return false;
/* we can check if the search key exists in some header and
filter out the messages that have no chance of matching */
else {
/* checking potential existence of the header name */
}
break;
default:
return false;
}
if (q == NULL) {
/* couldn't handle this search after all (e.g. trying to search
a stop word) */
return false;
}
if (!and_args)
else
return true;
}
static int
{
struct fts_score_map *score;
int ret = 0;
try {
result->scores_sorted = true;
&uid) < 0) {
ret = -1;
break;
}
result->scores_sorted = false;
}
}
return ret;
} catch (CLuceneError &err) {
return -1;
}
}
struct fts_result *result)
{
struct mail_search_arg *arg;
if (lucene_index_open_search(index) <= 0)
return -1;
bool have_definites = false;
arg->match_always = true;
have_definites = true;
}
}
if (have_definites) {
&result->definite_uids) < 0)
return -1;
}
bool have_maybies = false;
arg->match_always = true;
have_maybies = true;
}
}
if (have_maybies) {
&result->maybe_uids) < 0)
return -1;
}
return 0;
}
static int
{
struct fts_score_map *score;
int ret = 0;
struct hash_iterate_context *iter;
}
try {
i_error("lucene: Corrupted FTS index %s: No mailbox for document",
ret = -1;
break;
}
continue;
}
&uid) < 0) {
ret = -1;
break;
}
}
}
return ret;
} catch (CLuceneError &err) {
return -1;
}
}
struct hash_table *guids,
struct fts_multi_result *result)
{
struct mail_search_arg *arg;
if (lucene_index_open_search(index) <= 0)
return -1;
bool have_definites = false;
arg->match_always = true;
have_definites = true;
}
}
if (have_definites) {
return -1;
}
return 0;
}
/* Iterator state for walking the records of a Lucene index. */
struct lucene_index_iter {
/* index being iterated */
struct lucene_index *index;
/* record storage embedded in the iterator; NOTE(review): presumably the
   record handed back to the caller on each step - confirm */
struct lucene_index_record rec;
/* current position; advanced by one on each step */
size_t i;
/* NOTE(review): presumably set when iteration hit an error - confirm
   against the init/deinit code */
bool failed;
};
struct lucene_index_iter *
{
struct lucene_index_iter *iter;
int ret;
if (ret < 0)
return iter;
}
try {
} catch (CLuceneError &err) {
}
return iter;
}
const struct lucene_index_record *
{
return NULL;
return NULL;
iter->i++;
}
{
}
return ret;
}
/* Shutdown hook for the Lucene wrapper. Takes no arguments and returns
   nothing; with the body as written here it performs no work. */
void lucene_shutdown(void)
{
}