/* fts-backend-solr.c, revision f56a965dc18dcd1bdf3daa29199cafde15e6ea8a */
/* Copyright (c) 2006-2008 Dovecot authors, see the included COPYING file */
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen#include "lib.h"
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen#include "array.h"
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen#include "str.h"
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen#include "mail-storage-private.h"
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen#include "mail-namespace.h"
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen#include "solr-connection.h"
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen#include "fts-solr-plugin.h"
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen#include <curl/curl.h>
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen
/* Initial size of the XML command buffer. Once the buffered data gets within
   128 bytes of this size, it is handed over to the Solr post. */
#define SOLR_CMDBUF_SIZE (1024*64)
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainenstruct solr_fts_backend_build_context {
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen struct fts_backend_build_context ctx;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen struct solr_connection_post *post;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen uint32_t prev_uid, uid_validity;
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen string_t *cmd;
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen bool headers;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen};
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
/* Solr connection shared by every backend instance in this file. It is
   created by the first fts_backend_solr_init() call. */
static struct solr_connection *solr_conn = NULL;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
/* Append str to dest, quoted by solr_connection_quote_str() using the shared
   Solr connection. */
static void solr_quote_str(string_t *dest, const char *str)
{
	struct solr_connection *conn = solr_conn;

	solr_connection_quote_str(conn, dest, str);
}
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainenstatic void
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainenxml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen{
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen unsigned int i;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen for (i = 0; i < len; i++) {
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen switch (data[i]) {
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen case '&':
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen str_append(dest, "&amp;");
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen break;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen case '<':
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen str_append(dest, "&lt;");
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen break;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen case '>':
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen str_append(dest, "&gt;");
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen break;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen default:
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen str_append_c(dest, data[i]);
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen break;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen }
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen }
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen}
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
/* Append the NUL-terminated string str to dest, escaped the same way as
   xml_encode_data() does. */
static void xml_encode(string_t *dest, const char *str)
{
	const unsigned char *bytes = (const unsigned char *)str;

	xml_encode_data(dest, bytes, strlen(str));
}
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainenstatic struct fts_backend *
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainenfts_backend_solr_init(struct mailbox *box ATTR_UNUSED)
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen{
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen const struct fts_solr_settings *set = &fts_solr_settings;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen struct fts_backend *backend;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen if (solr_conn == NULL)
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen solr_conn = solr_connection_init(set->url, set->debug);
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen backend = i_new(struct fts_backend, 1);
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen *backend = fts_backend_solr;
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen if (set->substring_search)
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen backend->flags |= FTS_BACKEND_FLAG_SUBSTRING_LOOKUPS;
578ef2538ccf42e2a48234c24a8b709397101d88Timo Sirainen return backend;
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen}
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen
/* Free a backend created by fts_backend_solr_init(). The shared Solr
   connection isn't touched here. */
static void fts_backend_solr_deinit(struct fts_backend *backend)
{
	/* only the backend struct itself was allocated by init */
	i_free(backend);
}
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
f50ea0370137dd93d9953d91ea73486ca0784de9Timo Sirainenstatic int fts_backend_solr_get_last_uid(struct fts_backend *backend,
f50ea0370137dd93d9953d91ea73486ca0784de9Timo Sirainen uint32_t *last_uid_r)
c664d0da658c8d3200d88ea3c4cd580afd33fa73Timo Sirainen{
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen struct mailbox_status status;
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen ARRAY_TYPE(seq_range) uids;
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen const struct seq_range *uidvals;
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen unsigned int count;
578ef2538ccf42e2a48234c24a8b709397101d88Timo Sirainen string_t *str;
578ef2538ccf42e2a48234c24a8b709397101d88Timo Sirainen
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen str = t_str_new(256);
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen str_append(str, "fl=uid&rows=1&sort=uid%20desc&q=");
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen mailbox_get_status(backend->box, STATUS_UIDVALIDITY, &status);
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen str_printfa(str, "uidv:%u%%20box:", status.uidvalidity);
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen solr_quote_str(str, backend->box->name);
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen str_append(str, "%20user:");
cf0ad1a0bddb0787f3d7b408a96d721a8b2a98a3Timo Sirainen solr_quote_str(str, backend->box->storage->ns->user->username);
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen
f50ea0370137dd93d9953d91ea73486ca0784de9Timo Sirainen t_array_init(&uids, 1);
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen if (solr_connection_select(solr_conn, str_c(str), &uids, NULL) < 0)
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen return -1;
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen uidvals = array_get(&uids, &count);
acc039dfc0b0f4588cf2feec04727b61e1c672a1Timo Sirainen if (count == 0) {
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen /* nothing indexed yet for this mailbox */
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen *last_uid_r = 0;
} else if (count == 1 && uidvals[0].seq1 == uidvals[0].seq2) {
*last_uid_r = uidvals[0].seq1;
} else {
i_error("fts_solr: Last UID lookup returned multiple rows");
return -1;
}
return 0;
}
/*
 * Begin an index build for the backend's mailbox.
 *
 * Allocates the build context, begins a post on the shared Solr connection,
 * creates the XML command buffer and remembers the mailbox's UIDVALIDITY.
 * *last_uid_r is always set to (uint32_t)-1 and *ctx_r to the new context.
 *
 * Always returns 0.
 */
static int
fts_backend_solr_build_init(struct fts_backend *backend, uint32_t *last_uid_r,
			    struct fts_backend_build_context **ctx_r)
{
	struct solr_fts_backend_build_context *build;
	struct mailbox_status status;

	*last_uid_r = (uint32_t)-1;

	build = i_new(struct solr_fts_backend_build_context, 1);
	build->ctx.backend = backend;
	build->post = solr_connection_post_begin(solr_conn);
	build->cmd = str_new(default_pool, SOLR_CMDBUF_SIZE);

	mailbox_get_status(backend->box, STATUS_UIDVALIDITY, &status);
	build->uid_validity = status.uidvalidity;

	*ctx_r = &build->ctx;
	return 0;
}
/*
 * Add a chunk of message text to the pending Solr update.
 *
 * When uid differs from the previous call, the previous <doc> is closed (or
 * <add> is opened for the first message) and a new <doc> is started with the
 * uid, uidv, box, user and id fields, followed by an open "hdr" or "body"
 * field depending on the headers flag. For the same uid, a change from body
 * to headers closes the body field and opens a "hdr" field. Going from
 * headers back to body is asserted not to happen.
 *
 * The XML-escaped data is appended to the command buffer, which is handed
 * over to the post once it gets close to SOLR_CMDBUF_SIZE.
 *
 * Always returns 0.
 */
static int
fts_backend_solr_build_more(struct fts_backend_build_context *_ctx,
			    uint32_t uid, const unsigned char *data,
			    size_t size, bool headers)
{
	struct solr_fts_backend_build_context *ctx =
		(struct solr_fts_backend_build_context *)_ctx;
	struct mailbox *box = _ctx->backend->box;
	string_t *xml = ctx->cmd;

	/* body comes first, then headers */
	if (ctx->prev_uid != uid) {
		const char *username = box->storage->ns->user->username;

		/* uid changed: finish the previous document, or start the
		   whole update if this is the first one */
		str_append(xml, ctx->prev_uid == 0 ?
			   "<add>" : "</field></doc>");
		ctx->prev_uid = uid;

		str_printfa(xml, "<doc>"
			    "<field name=\"uid\">%u</field>"
			    "<field name=\"uidv\">%u</field>",
			    uid, ctx->uid_validity);

		str_append(xml, "<field name=\"box\">");
		xml_encode(xml, box->name);
		str_append(xml, "</field><field name=\"user\">");
		xml_encode(xml, username);

		/* id = uid/uidvalidity/user/mailbox. the expunge code builds
		   the same id to delete the document. */
		str_printfa(xml, "</field><field name=\"id\">%u/%u/",
			    uid, ctx->uid_validity);
		xml_encode(xml, username);
		str_append_c(xml, '/');
		xml_encode(xml, box->name);
		str_append(xml, "</field>");

		ctx->headers = headers;
		str_append(xml, headers ?
			   "<field name=\"hdr\">" : "<field name=\"body\">");
	} else if (headers && !ctx->headers) {
		/* NOTE(review): ctx->headers isn't updated here, so another
		   header chunk for the same uid would open one more hdr
		   field - confirm callers send headers in a single call */
		str_append(xml, "</field><field name=\"hdr\">");
	} else {
		i_assert(!(!headers && ctx->headers));
	}

	xml_encode_data(xml, data, size);

	if (str_len(xml) > SOLR_CMDBUF_SIZE-128) {
		/* buffer is nearly full, pass it on and start over */
		solr_connection_post_more(ctx->post, str_data(xml),
					  str_len(xml));
		str_truncate(xml, 0);
	}
	return 0;
}
/*
 * Finish an index build and free its context.
 *
 * If at least one message was added, the open field, <doc> and <add> are
 * closed, the rest of the buffer is sent, the post is ended and a commit is
 * posted.
 *
 * Returns the result of ending the post, or -1 if the commit failed. Returns
 * 0 if nothing was added.
 *
 * NOTE(review): if nothing was added, the post begun in build_init() is
 * never given to solr_connection_post_end() - confirm that this doesn't leak
 * the post or leave the connection in the middle of a post.
 */
static int
fts_backend_solr_build_deinit(struct fts_backend_build_context *_ctx)
{
	struct solr_fts_backend_build_context *build =
		(struct solr_fts_backend_build_context *)_ctx;
	bool have_docs = build->prev_uid != 0;
	int ret = 0;

	if (have_docs) {
		str_append(build->cmd, "</field></doc></add>");
		solr_connection_post_more(build->post, str_data(build->cmd),
					  str_len(build->cmd));
		ret = solr_connection_post_end(build->post);

		/* commit and wait until the documents we just indexed are
		   visible to the following search. this is done even if
		   ending the post failed. */
		if (solr_connection_post(solr_conn,
			"<commit waitFlush=\"false\" waitSearcher=\"true\"/>") < 0)
			ret = -1;
	}

	str_free(&build->cmd);
	i_free(build);
	return ret;
}
/*
 * Ask Solr to delete the document of an expunged mail.
 *
 * The document id is built as uid/uidvalidity/user/mailbox, the same way as
 * when the document was added. A failed post is ignored.
 */
static void
fts_backend_solr_expunge(struct fts_backend *backend ATTR_UNUSED,
			 struct mail *mail)
{
	struct mailbox *box = mail->box;
	struct mailbox_status status;

	mailbox_get_status(box, STATUS_UIDVALIDITY, &status);

	T_BEGIN {
		string_t *request = t_str_new(256);

		str_printfa(request, "<delete><id>%u/%u/",
			    mail->uid, status.uidvalidity);
		xml_encode(request, box->storage->ns->user->username);
		str_append_c(request, '/');
		xml_encode(request, box->name);
		str_append(request, "</id></delete>");

		(void)solr_connection_post(solr_conn, str_c(request));
	} T_END;
}
/* Post a commit after expunging. The commit doesn't wait for the flush or
   for a new searcher, and its result is ignored. */
static void
fts_backend_solr_expunge_finish(struct fts_backend *backend ATTR_UNUSED,
				struct mailbox *box ATTR_UNUSED,
				bool committed ATTR_UNUSED)
{
	(void)solr_connection_post(solr_conn,
		"<commit waitFlush=\"false\" waitSearcher=\"false\"/>");
}
/* Lock callback. Nothing is locked in this backend; it always returns 1. */
static int fts_backend_solr_lock(struct fts_backend *backend ATTR_UNUSED)
{
	const int locked = 1;

	return locked;
}
/* Unlock callback, the counterpart of fts_backend_solr_lock(). */
static void fts_backend_solr_unlock(struct fts_backend *backend ATTR_UNUSED)
{
	/* nothing was locked, so there's nothing to release */
}
/*
 * Search the backend's mailbox in Solr.
 *
 * The lookup fields are converted to a query where each key is searched from
 * "body", "hdr" or "any" depending on its flags, prefixed with '-' for
 * inverted fields. The mailbox is selected with a separate filter query on
 * uidvalidity, mailbox name and user.
 *
 * definite_uids and scores are filled by solr_connection_select().
 * maybe_uids is always cleared.
 *
 * Returns the return value of solr_connection_select().
 */
static int fts_backend_solr_lookup(struct fts_backend_lookup_context *ctx,
				   ARRAY_TYPE(seq_range) *definite_uids,
				   ARRAY_TYPE(seq_range) *maybe_uids,
				   ARRAY_TYPE(fts_score_map) *scores)
{
	struct mailbox *box = ctx->backend->box;
	const struct fts_backend_lookup_field *fields;
	unsigned int i, count;
	struct mailbox_status status;
	string_t *str;

	/* uidnext is used as the row limit below, so it has to be requested
	   along with uidvalidity. asking only for uidvalidity leaves no
	   guarantee that uidnext is filled in. */
	mailbox_get_status(box, STATUS_UIDVALIDITY | STATUS_UIDNEXT, &status);

	str = t_str_new(256);
	/* presumably uidnext is used as the limit because the mailbox can't
	   contain more messages than that */
	str_printfa(str, "fl=uid,score&rows=%u&sort=uid%%20asc&q=",
		    status.uidnext);

	/* build a lucene search query from the fields */
	fields = array_get(&ctx->fields, &count);
	for (i = 0; i < count; i++) {
		if (i > 0)
			str_append(str, "%20");

		if ((fields[i].flags & FTS_LOOKUP_FLAG_INVERT) != 0)
			str_append_c(str, '-');

		if ((fields[i].flags & FTS_LOOKUP_FLAG_HEADER) == 0) {
			/* body only */
			i_assert((fields[i].flags & FTS_LOOKUP_FLAG_BODY) != 0);
			str_append(str, "body:");
		} else if ((fields[i].flags & FTS_LOOKUP_FLAG_BODY) == 0) {
			/* header only */
			str_append(str, "hdr:");
		} else {
			/* both */
			str_append(str, "any:");
		}
		solr_quote_str(str, fields[i].key);
	}

	/* use a separate filter query for selecting the mailbox. it shouldn't
	   affect the score and there could be some caching benefits too. */
	str_printfa(str, "&fq=uidv:%u%%20box:", status.uidvalidity);
	solr_quote_str(str, box->name);
	str_append(str, "%20user:");
	solr_quote_str(str, box->storage->ns->user->username);

	array_clear(maybe_uids);
	return solr_connection_select(solr_conn, str_c(str),
				      definite_uids, scores);
}
/* Template for Solr backends. fts_backend_solr_init() copies this into each
   backend it creates. */
struct fts_backend fts_backend_solr = {
	MEMBER(name) "solr",
	MEMBER(flags) 0,

	/* callbacks, in the order the struct declares them */
	{
		fts_backend_solr_init,
		fts_backend_solr_deinit,
		fts_backend_solr_get_last_uid,
		fts_backend_solr_build_init,
		fts_backend_solr_build_more,
		fts_backend_solr_build_deinit,
		fts_backend_solr_expunge,
		fts_backend_solr_expunge_finish,
		fts_backend_solr_lock,
		fts_backend_solr_unlock,
		/* two callbacks that this backend doesn't implement */
		NULL,
		NULL,
		fts_backend_solr_lookup
	}
};