fts-backend-solr.c revision 2d7df7973f80011033e8e9fa676d3ff4c14468d8
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose/* Copyright (c) 2006-2013 Dovecot authors, see the included COPYING file */
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose#define SOLR_CMDBUF_FLUSH_SIZE (SOLR_CMDBUF_SIZE-128)
2cf7becc05996eb6d8a3352d3d7b97c75652e590Sumit Bose /* Valid characters in XML:
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose [#x10000-#x10FFFF]
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose This function gets called only for #x80 and higher */
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosestatic unsigned int
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosexml_encode_data_max(string_t *dest, const unsigned char *data, unsigned int len,
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose unsigned int max_len)
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose unsigned int i;
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose for (i = 0; i < max_len; i++) {
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose switch (data[i]) {
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose /* exceptions to the following control char check */
233a3c6c48972b177e60d6ef4cecfacd3cf31659Simo Sorce /* SOLR doesn't like control characters.
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose replace them with spaces. */
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose /* make sure the character is valid for XML
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose so we don't get XML parser errors */
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose unsigned int char_len =
a3c8390d19593b1e5277d95bfb4ab206d4785150Nikolai Kondrashov uni_utf8_get_char_n(data + i, char_len, &chr) == 1 &&
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosexml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosestatic void xml_encode(string_t *dest, const char *str)
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose xml_encode_data(dest, (const unsigned char *)str, strlen(str));
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosestatic void solr_quote_http(string_t *dest, const char *str)
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bosestatic struct fts_backend *fts_backend_solr_alloc(void)
233a3c6c48972b177e60d6ef4cecfacd3cf31659Simo Sorcefts_backend_solr_init(struct fts_backend *_backend, const char **error_r)
3e9712c2fdbba8f9cd25886943331e76e0b2ceddSumit Bose struct fts_solr_user *fuser = FTS_SOLR_USER_CONTEXT(_backend->ns->user);
3e9712c2fdbba8f9cd25886943331e76e0b2ceddSumit Bose const struct fts_solr_settings *set = &fuser->set;
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosestatic void fts_backend_solr_deinit(struct fts_backend *_backend)
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose struct solr_fts_backend *backend = (struct solr_fts_backend *)_backend;
7ee9ac32485483beece872d6fcb3096fa77a004bSumit Boseget_last_uid_fallback(struct fts_backend *_backend, struct mailbox *box,
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose unsigned int count;
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose str_append(str, "fl=uid&rows=1&sort=uid+desc&q=");
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose solr_quote_http(str, _backend->ns->owner->username);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose pool = pool_alloconly_create("solr last uid lookup", 1024);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose if (solr_connection_select(solr_conn, str_c(str),
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose /* no UIDs */
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose if (count == 1 && uidvals[0].seq1 == uidvals[0].seq2) {
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose i_error("fts_solr: Last UID lookup returned multiple rows");
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosefts_backend_solr_get_last_uid(struct fts_backend *_backend,
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose /* either nothing has been indexed, or the index was corrupted.
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose do it the slow way. */
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose if (get_last_uid_fallback(_backend, box, last_uid_r) < 0)
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosefts_backend_solr_update_init(struct fts_backend *_backend)
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose ctx = i_new(struct solr_fts_backend_update_context, 1);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosestatic void xml_encode_id(struct solr_fts_backend_update_context *ctx,
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose xml_encode(str, ctx->ctx.backend->ns->owner->username);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosefts_backend_solr_doc_open(struct solr_fts_backend_update_context *ctx,
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose "<field name=\"uid\">%u</field>"
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose "<field name=\"box\">%s</field>",
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose xml_encode(ctx->cmd, ctx->ctx.backend->ns->owner->username);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosefts_solr_field_get(struct solr_fts_backend_update_context *ctx, const char *key)
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose /* there are only a few fields. this lookup is fast enough. */
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosefts_backend_solr_doc_close(struct solr_fts_backend_update_context *ctx)
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose str_printfa(ctx->cmd, "<field name=\"%s\">", field->key);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose xml_encode_data(ctx->cmd, str_data(field->value), str_len(field->value));
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosefts_backed_solr_build_commit(struct solr_fts_backend_update_context *ctx)
e2f6326ea56217afab7623c542a237ee84eb74daSumit Bose solr_connection_post_more(ctx->post, str_data(ctx->cmd),
e2f6326ea56217afab7623c542a237ee84eb74daSumit Bosefts_backend_solr_expunge_flush(struct solr_fts_backend_update_context *ctx)
e2f6326ea56217afab7623c542a237ee84eb74daSumit Bose (void)solr_connection_post(solr_conn, str_c(ctx->cmd_expunge));
a3c8390d19593b1e5277d95bfb4ab206d4785150Nikolai Kondrashov str_append(ctx->cmd_expunge, "<delete>");
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosefts_backend_solr_update_deinit(struct fts_backend_update_context *_ctx)
2962b3d1e072ff2ebbe343095812dad697d6bf1dSumit Bose const char *str;
a3c8390d19593b1e5277d95bfb4ab206d4785150Nikolai Kondrashov if (ctx->documents_added || ctx->expunges) {
a3c8390d19593b1e5277d95bfb4ab206d4785150Nikolai Kondrashov /* commit and wait until the documents we just indexed are
e2f6326ea56217afab7623c542a237ee84eb74daSumit Bose visible to the following search */
e2f6326ea56217afab7623c542a237ee84eb74daSumit Bose str = t_strdup_printf("<commit waitSearcher=\"%s\"/>",
e2f6326ea56217afab7623c542a237ee84eb74daSumit Bosefts_backend_solr_update_set_mailbox(struct fts_backend_update_context *_ctx,
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose fts_index_set_last_uid(ctx->cur_box, ctx->prev_uid);
210e57203a1502f78a16b05010d52c9121b644e3Lukas Slebodnik if (fts_mailbox_get_guid(box, &box_guid) < 0)
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose i_assert(strlen(box_guid) == sizeof(ctx->box_guid)-1);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose memcpy(ctx->box_guid, box_guid, sizeof(ctx->box_guid)-1);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosefts_backend_solr_update_expunge(struct fts_backend_update_context *_ctx,
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose /* don't waste time asking Solr to expunge a message that is
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose highly unlikely to be indexed at this time. */
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose if (str_len(ctx->cmd_expunge) >= SOLR_CMDBUF_FLUSH_SIZE)
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosefts_backend_solr_uid_changed(struct solr_fts_backend_update_context *ctx,
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose ctx->cmd = str_new(default_pool, SOLR_CMDBUF_SIZE);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose ctx->post = solr_connection_post_begin(solr_conn);
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bosefts_backend_solr_update_set_build_key(struct fts_backend_update_context *_ctx,
2cf7becc05996eb6d8a3352d3d7b97c75652e590Sumit Bose /* fall through */
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bosefts_backend_solr_update_unset_build_key(struct fts_backend_update_context *_ctx)
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose /* There can be multiple duplicate keys (duplicate header lines,
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose multiple MIME body parts). Make sure they are separated by
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose whitespace. */
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bosefts_backend_solr_update_build_more(struct fts_backend_update_context *_ctx,
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose unsigned int len;
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose if (ctx->cur_value2 == NULL && ctx->cur_value == ctx->cmd) {
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose /* we're writing to message body. if size is huge,
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose flush it once in a while */
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose if (str_len(ctx->cmd) >= SOLR_CMDBUF_FLUSH_SIZE) {
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose if (str_len(ctx->cmd) >= SOLR_CMDBUF_FLUSH_SIZE) {
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose solr_connection_post_more(ctx->post, str_data(ctx->cmd),
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose if (str_len(ctx->cur_value) >= SOLR_BUFFER_WARN_SIZE &&
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose /* a large header */
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose i_warning("fts-solr(%s): Mailbox %s UID=%u header size is huge",
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozek mailbox_get_vname(ctx->cur_box), ctx->prev_uid);
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bosestatic int fts_backend_solr_refresh(struct fts_backend *backend ATTR_UNUSED)
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozekstatic int fts_backend_solr_rescan(struct fts_backend *backend)
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose /* FIXME: proper rescan needed. for now we'll just reset the
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose last-uids */
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose iter = mailbox_list_iter_init(backend->ns->list, "*",
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose while ((info = mailbox_list_iter_next(iter)) != NULL) {
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose box = mailbox_alloc(info->ns->list, info->vname, 0);
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bosestatic int fts_backend_solr_optimize(struct fts_backend *backend ATTR_UNUSED)
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose const char *solr_escape_chars = "+-&|!(){}[]^\"~*?:\\ ";
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bosestatic void solr_add_str_arg(string_t *str, struct mail_search_arg *arg)
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose /* currently we'll just disable fuzzy searching if there are any
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose parameters that need escaping. solr doesn't seem to give good
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose fuzzy results even if we did escape them. */
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose if (!arg->fuzzy || solr_need_escaping(arg->value.str))
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bosesolr_add_definite_query(string_t *str, struct mail_search_arg *arg)
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozek if (!fts_header_want_indexed(arg->hdr_field_name))
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozek str_append(str, t_str_lcase(arg->hdr_field_name));
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bosesolr_add_definite_query_args(string_t *str, struct mail_search_arg *arg,
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose unsigned int last_len;
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozeksolr_add_maybe_query(string_t *str, struct mail_search_arg *arg)
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozek if (fts_header_want_indexed(arg->hdr_field_name))
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose /* all matches would be definite, but all non-matches
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose would be maybes. too much trouble to optimize. */
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose /* we can check if the search key exists in some header and
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose filter out the messages that have no chance of matching */
abee3216261e3378430e472f0c992470b33976f0Sumit Bose /* checking potential existence of the header name */
abee3216261e3378430e472f0c992470b33976f0Sumit Bose solr_quote_http(str, t_str_lcase(arg->hdr_field_name));
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozeksolr_add_maybe_query_args(string_t *str, struct mail_search_arg *arg,
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozek unsigned int last_len;
abee3216261e3378430e472f0c992470b33976f0Sumit Bosestatic int solr_search(struct fts_backend *_backend, string_t *str,
abee3216261e3378430e472f0c992470b33976f0Sumit Bose const char *box_guid, ARRAY_TYPE(seq_range) *uids_r,
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose pool_t pool = pool_alloconly_create("fts solr search", 1024);
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose /* use a separate filter query for selecting the mailbox. it shouldn't
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose affect the score and there could be some caching benefits too. */
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose str_printfa(str, "&fq=%%2Bbox:%s+%%2Buser:", box_guid);
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose solr_quote_http(str, _backend->ns->owner->username);
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose ret = solr_connection_select(solr_conn, str_c(str), pool, &results);
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose array_append_array(scores_r, &results[0]->scores);
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bosefts_backend_solr_lookup(struct fts_backend *_backend, struct mailbox *box,
415d93196533a6fcd90889c67396ef5af5bf791aSumit Bose unsigned int prefix_len;
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozek mailbox_get_open_status(box, STATUS_UIDNEXT, &status);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose str_printfa(str, "fl=uid,score&rows=%u&sort=uid+asc&q=",
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose if (solr_add_definite_query_args(str, args, and_args)) {
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose if (solr_add_maybe_query_args(str, args, and_args)) {
a3c8390d19593b1e5277d95bfb4ab206d4785150Nikolai Kondrashovsolr_search_multi(struct fts_backend *_backend, string_t *str,
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose unsigned int i, len;
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose /* use a separate filter query for selecting the mailbox. it shouldn't
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose affect the score and there could be some caching benefits too. */
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose solr_quote_http(str, _backend->ns->owner->username);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose hash_table_create(&mailboxes, default_pool, 0, str_hash, strcmp);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose if (fts_mailbox_get_guid(boxes[i], &box_guid) < 0)
a3c8390d19593b1e5277d95bfb4ab206d4785150Nikolai Kondrashov hash_table_insert(mailboxes, t_strdup_noconst(box_guid),
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose if (solr_connection_select(solr_conn, str_c(str),
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bose box = hash_table_lookup(mailboxes, solr_results[i]->box_id);
ac7a7ee3d1e138818a1ed78758f7dd3c3306a56bSumit Bose i_warning("fts_solr: Lookup returned unexpected mailbox "
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozek fts_result->definite_uids = solr_results[i]->uids;
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozek result->box_results = array_idx_modifiable(&fts_results, 0);
36a12aea020a935ffa40505fa02860c3d921ad0cSumit Bosefts_backend_solr_lookup_multi(struct fts_backend *backend,
c125e741d3111e2f9b56866ba00835ca05c6f349Jakub Hrozek str_printfa(str, "fl=box,uid,score&rows=%u&sort=box+asc,uid+asc&q=",
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose if (solr_add_definite_query_args(str, args, and_args)) {
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose if (solr_search_multi(backend, str, boxes, result) < 0)
28c70f003c7b330ab1d998a4eff1248d272a6ba9Sumit Bose /* FIXME: maybe_uids could be handled also with some more work.. */