fts-backend-solr.c revision 4128fe33fde3cf20665650bb2b11a6450c09a816
2454dfa32c93c20a8522c6ed42fe057baaac9f9aStephan Bosch/* Copyright (c) 2006-2012 Dovecot authors, see the included COPYING file */
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen#define SOLR_CMDBUF_FLUSH_SIZE (SOLR_CMDBUF_SIZE-128)
4542c94adb8910e0174c784754e737cec16af59cTimo Sirainenstatic struct solr_connection *solr_conn = NULL;
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen /* Valid characters in XML:
e3367d7b54864d2e4b1931903e3f660ae64fbe3aTimo Sirainen #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
e3367d7b54864d2e4b1931903e3f660ae64fbe3aTimo Sirainen [#x10000-#x10FFFF]
e3367d7b54864d2e4b1931903e3f660ae64fbe3aTimo Sirainen This function gets called only for #x80 and higher */
e3367d7b54864d2e4b1931903e3f660ae64fbe3aTimo Sirainenstatic unsigned int
e3367d7b54864d2e4b1931903e3f660ae64fbe3aTimo Sirainenxml_encode_data_max(string_t *dest, const unsigned char *data, unsigned int len,
e3367d7b54864d2e4b1931903e3f660ae64fbe3aTimo Sirainen unsigned int max_len)
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen unsigned int i;
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen for (i = 0; i < max_len; i++) {
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen switch (data[i]) {
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen /* exceptions to the following control char check */
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen /* SOLR doesn't like control characters.
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen replace them with spaces. */
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen /* make sure the character is valid for XML
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen so we don't get XML parser errors */
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen unsigned int char_len =
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen uni_utf8_get_char_n(data + i, char_len, &chr) == 1 &&
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainenxml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
d979c1179d55ad86e40f869e48ef3e4db9c817b5Timo Sirainen (void)xml_encode_data_max(dest, data, len, len);
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainenstatic void xml_encode(string_t *dest, const char *str)
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen xml_encode_data(dest, (const unsigned char *)str, strlen(str));
d979c1179d55ad86e40f869e48ef3e4db9c817b5Timo Sirainenstatic void solr_quote_http(string_t *dest, const char *str)
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen solr_connection_http_escape(solr_conn, dest, str);
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainenstatic struct fts_backend *fts_backend_solr_alloc(void)
13e130c3af3032982de6b1d13c6dcddda9164848Timo Sirainenfts_backend_solr_init(struct fts_backend *_backend,
c115c742f730e312d6b6ab5064595cd0d8b4e26eTimo Sirainen struct fts_solr_user *fuser = FTS_SOLR_USER_CONTEXT(_backend->ns->user);
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen const struct fts_solr_settings *set = &fuser->set;
c115c742f730e312d6b6ab5064595cd0d8b4e26eTimo Sirainen solr_conn = solr_connection_init(set->url, set->debug);
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainenstatic void fts_backend_solr_deinit(struct fts_backend *_backend)
c115c742f730e312d6b6ab5064595cd0d8b4e26eTimo Sirainen struct solr_fts_backend *backend = (struct solr_fts_backend *)_backend;
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainenget_last_uid_fallback(struct fts_backend *_backend, struct mailbox *box,
8eba883232f80178b60fa416f73292bf5f990fecTimo Sirainen unsigned int count;
8eba883232f80178b60fa416f73292bf5f990fecTimo Sirainen str_append(str, "fl=uid&rows=1&sort=uid+desc&q=");
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen solr_quote_http(str, _backend->ns->owner->username);
a249dd267f05d349f1b4aa27b40a56083c8ba392Timo Sirainen pool = pool_alloconly_create("solr last uid lookup", 1024);
03010dbaa74ec70f062994dfe3cd39bedc99a28bTimo Sirainen if (solr_connection_select(solr_conn, str_c(str),
5f44975ec6c5755dd74bcd4c47a123a7242ecab3Timo Sirainen /* no UIDs */
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen uidvals = array_get(&results[0]->uids, &count);
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen if (count == 1 && uidvals[0].seq1 == uidvals[0].seq2) {
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen i_error("fts_solr: Last UID lookup returned multiple rows");
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainenfts_backend_solr_get_last_uid(struct fts_backend *_backend,
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen /* either nothing has been indexed, or the index was corrupted.
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen do it the slow way. */
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen if (get_last_uid_fallback(_backend, box, last_uid_r) < 0)
e15b305e90c9834734ccf35ed78f0ad29d570ee9Timo Sirainen (void)fts_index_set_last_uid(box, *last_uid_r);
e15b305e90c9834734ccf35ed78f0ad29d570ee9Timo Sirainenfts_backend_solr_update_init(struct fts_backend *_backend)
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen ctx = i_new(struct solr_fts_backend_update_context, 1);
e15b305e90c9834734ccf35ed78f0ad29d570ee9Timo Sirainen ctx->cmd = str_new(default_pool, SOLR_CMDBUF_SIZE);
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainenstatic void xml_encode_id(struct solr_fts_backend_update_context *ctx,
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen str_printfa(str, "%u/%s", uid, ctx->box_guid);
421d30619384e72a27e2a5d13ff6525aff4d17feTimo Sirainen xml_encode(str, ctx->ctx.backend->ns->owner->username);
ecd69c4e8371853667e01b0c16d436ef7f7393e2Timo Sirainenfts_backend_solr_doc_open(struct solr_fts_backend_update_context *ctx,
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen "<field name=\"uid\">%u</field>"
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen "<field name=\"box\">%s</field>",
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen str_append(ctx->cmd, "<field name=\"user\">");
a34bd633ab201f6a5ad1c00174fb8b0359031d00Timo Sirainen xml_encode(ctx->cmd, ctx->ctx.backend->ns->owner->username);
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainenfts_solr_field_get(struct solr_fts_backend_update_context *ctx, const char *key)
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen /* there are only a few fields. this lookup is fast enough. */
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainenfts_backend_solr_doc_close(struct solr_fts_backend_update_context *ctx)
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen array_foreach_modifiable(&ctx->fields, field) {
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen str_printfa(ctx->cmd, "<field name=\"%s\">", field->key);
942302b0247403645394d848b3c620ead262a2a5Timo Sirainenfts_backed_solr_build_commit(struct solr_fts_backend_update_context *ctx)
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen solr_connection_post_more(ctx->post, str_data(ctx->cmd),
942302b0247403645394d848b3c620ead262a2a5Timo Sirainenfts_backend_solr_update_deinit(struct fts_backend_update_context *_ctx)
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen (struct solr_fts_backend_update_context *)_ctx;
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen const char *str;
942302b0247403645394d848b3c620ead262a2a5Timo Sirainen /* commit and wait until the documents we just indexed are
942302b0247403645394d848b3c620ead262a2a5Timo Sirainen visible to the following search */
942302b0247403645394d848b3c620ead262a2a5Timo Sirainen str = t_strdup_printf("<commit waitFlush=\"false\" "
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen "waitSearcher=\"%s\"/>",
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen array_foreach_modifiable(&ctx->fields, field) {
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainenfts_backend_solr_update_set_mailbox(struct fts_backend_update_context *_ctx,
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen (struct solr_fts_backend_update_context *)_ctx;
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen (void)fts_index_set_last_uid(ctx->cur_box, ctx->prev_uid);
942302b0247403645394d848b3c620ead262a2a5Timo Sirainen i_assert(strlen(box_guid) == sizeof(ctx->box_guid)-1);
45d47bc4660fe4bbb07817c9580deef9cca63646Timo Sirainen memcpy(ctx->box_guid, box_guid, sizeof(ctx->box_guid)-1);
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen memset(ctx->box_guid, 0, sizeof(ctx->box_guid));
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainenfts_backend_solr_update_expunge(struct fts_backend_update_context *_ctx,
a34bd633ab201f6a5ad1c00174fb8b0359031d00Timo Sirainen (struct solr_fts_backend_update_context *)_ctx;
942302b0247403645394d848b3c620ead262a2a5Timo Sirainen if (!fts_index_get_header(ctx->cur_box, &hdr))
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen /* don't waste time asking Solr to expunge a message that is
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen highly unlikely to be indexed at this time. */
942302b0247403645394d848b3c620ead262a2a5Timo Sirainen (void)solr_connection_post(solr_conn, str_c(cmd));
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainenfts_backend_solr_uid_changed(struct solr_fts_backend_update_context *ctx,
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen ctx->post = solr_connection_post_begin(solr_conn);
a34bd633ab201f6a5ad1c00174fb8b0359031d00Timo Sirainenfts_backend_solr_update_set_build_key(struct fts_backend_update_context *_ctx,
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen /* fall through */
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen ctx->cur_value = fts_solr_field_get(ctx, "hdr");
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen str_append(ctx->cmd, "<field name=\"body\">");
9af6cc9ebc9986c1275ebdfa29c39e152af1557eTimo Sirainenfts_backend_solr_update_unset_build_key(struct fts_backend_update_context *_ctx)
21aaa6affb9f134112b75b5db737309fc35ef1cfMartti Rannanjärvi struct solr_fts_backend_update_context *ctx =
9af6cc9ebc9986c1275ebdfa29c39e152af1557eTimo Sirainen (struct solr_fts_backend_update_context *)_ctx;
21aaa6affb9f134112b75b5db737309fc35ef1cfMartti Rannanjärvi /* There can be multiple duplicate keys (duplicate header lines,
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen multiple MIME body parts). Make sure they are separated by
21aaa6affb9f134112b75b5db737309fc35ef1cfMartti Rannanjärvi whitespace. */
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainenfts_backend_solr_update_build_more(struct fts_backend_update_context *_ctx,
225e82df5dd1e765f4e52b80c954558f00e5a7dfTimo Sirainen (struct solr_fts_backend_update_context *)_ctx;
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen unsigned int len;
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen if (ctx->cur_value2 == NULL && ctx->cur_value == ctx->cmd) {
a34bd633ab201f6a5ad1c00174fb8b0359031d00Timo Sirainen /* we're writing to message body. if size is huge,
18a41cbd38f83429b790414c1159c097af4a59b8Timo Sirainen flush it once in a while */
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen if (str_len(ctx->cmd) >= SOLR_CMDBUF_FLUSH_SIZE) {
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen len = xml_encode_data_max(ctx->cmd, data, size,
421d30619384e72a27e2a5d13ff6525aff4d17feTimo Sirainen if (str_len(ctx->cmd) >= SOLR_CMDBUF_FLUSH_SIZE) {
ecd69c4e8371853667e01b0c16d436ef7f7393e2Timo Sirainen solr_connection_post_more(ctx->post, str_data(ctx->cmd),
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen if (str_len(ctx->cur_value) >= SOLR_BUFFER_WARN_SIZE &&
f46885a5b78b15a8d2419f6e5d13b643bd85e41fTimo Sirainen /* a large header */
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen i_warning("fts-solr(%s): Mailbox %s UID=%u header size is huge",
24e5e4526d8f5cbc056ab97fd0d154d0936d7a5eTimo Sirainen mailbox_get_vname(ctx->cur_box), ctx->prev_uid);
306b3f41b05da642d87e7ca7a1496efce9f5902fTimo Sirainenstatic int fts_backend_solr_refresh(struct fts_backend *backend ATTR_UNUSED)
ad48319996942463675b53877092ab7e13a7a75aTimo Sirainenstatic int fts_backend_solr_rescan(struct fts_backend *backend)
return ret;
return TRUE;
return FALSE;
case SEARCH_TEXT: {
case SEARCH_BODY:
case SEARCH_HEADER:
case SEARCH_HEADER_ADDRESS:
return FALSE;
return FALSE;
return TRUE;
bool and_args)
unsigned int last_len;
if (and_args)
return FALSE;
return TRUE;
case SEARCH_HEADER:
case SEARCH_HEADER_ADDRESS:
return FALSE;
return FALSE;
return FALSE;
return TRUE;
bool and_args)
unsigned int last_len;
if (and_args)
return FALSE;
return TRUE;
int ret;
return ret;
const char *box_guid;
unsigned int prefix_len;
const char *box_guid;
unsigned int i, len;
boxes[i]);