fts-backend-solr.c revision f3b0efdcbd0bd9059574c8f86d6cb43e16c8e521
/* Copyright (c) 2006-2016 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "array.h"
#include "str.h"
#include "hash.h"
#include "strescape.h"
#include "unichar.h"
#include "http-url.h"
#include "mail-storage-private.h"
#include "mailbox-list-private.h"
#include "mail-search.h"
#include "fts-api.h"
#include "solr-connection.h"
#include "fts-solr-plugin.h"
#include <ctype.h>
#define SOLR_MAX_MULTI_ROWS 100000
/* If header is larger than this, truncate it. */
/* If SOLR_HEADER_MAX_SIZE was already reached, still write to individual
header fields as long as they're smaller than this */
#define SOLR_HEADER_LINE_MAX_TRUNC_SIZE 1024
#define SOLR_QUERY_MAX_MAILBOX_COUNT 10
/* How often to flush indexing request to Solr before beginning a new one. */
#define SOLR_MAIL_FLUSH_INTERVAL 1000
/* Solr flavour of an FTS backend: the generic struct fts_backend is
   embedded as the first member, followed by a pointer to the
   struct solr_connection this backend holds. */
struct solr_fts_backend {
struct fts_backend backend;
struct solr_connection *solr_conn;
};
/* Entry describing one field, identified by its key string. */
struct solr_fts_field {
char *key;
};
/* Update-time state for the Solr backend.
   NOTE(review): code further down in this file also accesses
   ctx->last_indexed_uid and ctx->cur_value2, neither of which is declared
   in the member list below - confirm the member list is complete. */
struct solr_fts_backend_update_context {
/* generic update context; the update callbacks in this file cast their
   _ctx pointer to this struct, which presumably relies on this being
   the first member - TODO confirm */
struct fts_backend_update_context ctx;
/* handle for a Solr POST request */
struct solr_connection_post *post;
/* set back to 0 by the flush code in this file */
unsigned int mails_since_flush;
/* one-bit state flags */
/* tested when build data is appended */
unsigned int tokenized_input:1;
/* tested before ctx->last_indexed_uid is initialised to 0 */
unsigned int last_indexed_uid_set:1;
unsigned int body_open:1;
unsigned int documents_added:1;
unsigned int expunges:1;
/* tested by the code that logs the "header size is huge, truncating"
   warning */
unsigned int truncate_header:1;
};
/* Characters that have special meaning in Solr query syntax, plus the
   space character.
   NOTE(review): presumably the set consulted by solr_need_escaping() and
   solr_escape(); confirm against their full bodies. */
static const char *solr_escape_chars = "+-&|!(){}[]^\"~*?:\\/ ";
{
/* Valid characters in XML:
#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
[#x10000-#x10FFFF]
This function gets called only for #x80 and higher */
return FALSE;
return FALSE;
return chr < 0x10ffff;
}
static unsigned int
unsigned int max_len)
{
unsigned int i;
for (i = 0; i < max_len; i++) {
switch (data[i]) {
case '&':
break;
case '<':
break;
case '>':
break;
case '\t':
case '\n':
case '\r':
/* exceptions to the following control char check */
break;
default:
if (data[i] < 32) {
/* SOLR doesn't like control characters.
replace them with spaces. */
} else if (data[i] >= 0x80) {
/* make sure the character is valid for XML
so we don't get XML parser errors */
unsigned int char_len =
else {
}
i += char_len - 1;
} else {
}
break;
}
}
return i;
}
static void
{
}
{
}
/* Takes a NUL-terminated string and is declared to return a
   const char *. The loop visits every character of str up to the
   terminator.
   NOTE(review): neither the per-character handling nor a return
   statement appears in this block - confirm against the complete
   function before relying on its result. */
static const char *solr_escape(const char *str)
{
unsigned int i;
for (i = 0; str[i] != '\0'; i++) {
}
}
{
if (str[0] != '\0')
else
}
/* Takes no arguments and is declared to return a struct fts_backend
   pointer; works through a local struct solr_fts_backend pointer.
   NOTE(review): the allocation and the return statement do not appear
   in this block - confirm against the complete function. */
static struct fts_backend *fts_backend_solr_alloc(void)
{
struct solr_fts_backend *backend;
}
static int
{
*error_r = "Invalid fts_solr setting";
return -1;
}
/* change our flags so we get proper input */
}
}
{
}
static int
{
const char *box_guid;
unsigned int count;
struct solr_result **results;
int ret = 0;
return -1;
else
ret = -1;
/* no UIDs */
*last_uid_r = 0;
} else {
} else {
i_error("fts_solr: Last UID lookup returned multiple rows");
ret = -1;
}
}
pool_unref(&pool);
return ret;
}
static int
{
struct fts_index_header hdr;
return 0;
}
/* either nothing has been indexed, or the index was corrupted.
do it the slow way. */
return -1;
return 0;
}
static struct fts_backend_update_context *
{
struct solr_fts_backend_update_context *ctx;
}
{
}
}
static void
{
"<field name=\"uid\">%u</field>"
"<field name=\"box\">%s</field>",
}
static string_t *
{
const struct solr_fts_field *field;
struct solr_fts_field new_field;
/* there are only a few fields. this lookup is fast enough. */
}
}
static void
{
struct solr_fts_field *field;
}
}
}
static int
{
return 0;
ctx->mails_since_flush = 0;
}
static void
{
struct solr_fts_backend *backend =
}
static int
{
struct solr_fts_backend_update_context *ctx =
(struct solr_fts_backend_update_context *)_ctx;
struct solr_fts_backend *backend =
struct solr_fts_field *field;
const char *str;
if (fts_backed_solr_build_flush(ctx) < 0)
ret = -1;
/* commit and wait until the documents we just indexed are
visible to the following search */
ret = -1;
}
}
return ret;
}
static void
{
struct solr_fts_backend_update_context *ctx =
(struct solr_fts_backend_update_context *)_ctx;
const char *box_guid;
/* flush solr between mailboxes, so we don't wrongly update
last_uid before we know it has succeeded */
if (fts_backed_solr_build_flush(ctx) < 0)
}
} else {
}
}
static void
{
struct solr_fts_backend_update_context *ctx =
(struct solr_fts_backend_update_context *)_ctx;
struct fts_index_header hdr;
if (!ctx->last_indexed_uid_set) {
ctx->last_indexed_uid = 0;
else
}
if (ctx->last_indexed_uid == 0 ||
/* don't waste time asking Solr to expunge a message that is
highly unlikely to be indexed at this time. */
return;
}
}
}
static void
{
struct solr_fts_backend *backend =
if (fts_backed_solr_build_flush(ctx) < 0)
}
} else {
}
}
static bool
const struct fts_backend_build_key *key)
{
struct solr_fts_backend_update_context *ctx =
(struct solr_fts_backend_update_context *)_ctx;
ctx->cur_value2 =
}
/* fall through */
break;
}
break;
i_unreached();
}
return TRUE;
}
static void
{
struct solr_fts_backend_update_context *ctx =
(struct solr_fts_backend_update_context *)_ctx;
/* There can be multiple duplicate keys (duplicate header lines,
multiple MIME body parts). Make sure they are separated by
whitespace. */
}
}
static int
{
struct solr_fts_backend_update_context *ctx =
(struct solr_fts_backend_update_context *)_ctx;
unsigned int len;
return -1;
/* we're writing to message body. if size is huge,
flush it once in a while */
while (size >= SOLR_CMDBUF_FLUSH_SIZE) {
}
}
if (ctx->tokenized_input)
} else {
if (!ctx->truncate_header) {
if (ctx->tokenized_input)
}
(!ctx->truncate_header ||
if (ctx->tokenized_input)
}
}
}
if (!ctx->truncate_header &&
/* a large header */
i_warning("fts-solr(%s): Mailbox %s UID=%u header size is huge, truncating",
}
return 0;
}
{
return 0;
}
{
/* FIXME: proper rescan needed. for now we'll just reset the
last-uids */
return fts_backend_reset_last_uids(backend);
}
{
return 0;
}
/* Predicate over a NUL-terminated string: returns TRUE from inside a
   nested block and FALSE when that block is never taken.
   NOTE(review): the condition guarding the TRUE return does not appear
   here; presumably it scans str for characters listed in
   solr_escape_chars - confirm. */
static bool solr_need_escaping(const char *str)
{
return TRUE;
}
return FALSE;
}
{
/* currently we'll just disable fuzzy searching if there are any
parameters that need escaping. solr doesn't seem to give good
fuzzy results even if we did escape them.. */
else {
}
}
static bool
{
case SEARCH_TEXT: {
break;
}
case SEARCH_BODY:
break;
case SEARCH_HEADER:
case SEARCH_HEADER_ADDRESS:
return FALSE;
break;
default:
return FALSE;
}
return TRUE;
}
static bool
bool and_args)
{
unsigned int last_len;
if (and_args)
else
}
}
return FALSE;
return TRUE;
}
static bool
{
case SEARCH_HEADER:
case SEARCH_HEADER_ADDRESS:
return FALSE;
/* all matches would be definite, but all non-matches
would be maybes. too much trouble to optimize. */
return FALSE;
}
/* we can check if the search key exists in some header and
filter out the messages that have no chance of matching */
else {
/* checking potential existence of the header name */
}
break;
default:
return FALSE;
}
return TRUE;
}
static bool
bool and_args)
{
unsigned int last_len;
if (and_args)
else
}
}
return FALSE;
return TRUE;
}
{
struct solr_result **results;
int ret;
/* use a separate filter query for selecting the mailbox. it shouldn't
affect the score and there could be some caching benefits too. */
else
}
pool_unref(&pool);
return ret;
}
static int
struct mail_search_arg *args,
enum fts_lookup_flags flags,
struct fts_result *result)
{
struct mailbox_status status;
const char *box_guid;
unsigned int prefix_len;
return -1;
(flags & FTS_LOOKUP_FLAG_NO_AUTO_FUZZY) == 0 ?
return -1;
}
return -1;
}
return 0;
}
static int
struct fts_multi_result *result)
{
struct solr_result **solr_results;
struct fts_result *fts_result;
const char *box_guid;
unsigned int i, len;
bool search_all_mailboxes;
/* use a separate filter query for selecting the mailbox. it shouldn't
affect the score and there could be some caching benefits too. */
else
if (!search_all_mailboxes)
continue;
if (!search_all_mailboxes) {
}
boxes[i]);
}
if (!search_all_mailboxes)
return -1;
}
for (i = 0; solr_results[i] != NULL; i++) {
if (!search_all_mailboxes) {
i_warning("fts_solr: Lookup returned unexpected mailbox "
}
continue;
}
if ((flags & FTS_LOOKUP_FLAG_NO_AUTO_FUZZY) == 0)
else
}
return 0;
}
static int
struct mail_search_arg *args,
enum fts_lookup_flags flags,
struct fts_multi_result *result)
{
return -1;
}
/* FIXME: maybe_uids could be handled also with some more work.. */
return 0;
}
/* Backend descriptor for this file, defined with external linkage and
   carrying the name "solr".
   NOTE(review): the nested initializer below is empty in this block -
   confirm which members it is meant to populate. */
struct fts_backend fts_backend_solr = {
.name = "solr",
{
}
};