fts-backend-solr-old.c revision c215ca02d468b0e542523df1ed18e5f2d7e63968
02c335c23bf5fa225a467c19f2c063fb0dc7b8c3Timo Sirainen/* Copyright (c) 2006-2013 Dovecot authors, see the included COPYING file */
e925f007930955f10fa8728509a44f7a53d2b13eTimo Sirainen /* Valid characters in XML:
e925f007930955f10fa8728509a44f7a53d2b13eTimo Sirainen #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
e925f007930955f10fa8728509a44f7a53d2b13eTimo Sirainen [#x10000-#x10FFFF]
e925f007930955f10fa8728509a44f7a53d2b13eTimo Sirainen This function gets called only for #x80 and higher */
90c23747727c85f80e4e8eed7968f0edbeac7ac5Timo Sirainenxml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen unsigned int i;
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen for (i = 0; i < len; i++) {
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen switch (data[i]) {
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen /* exceptions to the following control char check */
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen /* SOLR doesn't like control characters.
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen replace them with spaces. */
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen /* make sure the character is valid for XML
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen so we don't get XML parser errors */
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen unsigned int char_len =
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen uni_utf8_get_char_n(data + i, char_len, &chr) == 1 &&
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenstatic void xml_encode(string_t *dest, const char *str)
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen xml_encode_data(dest, (const unsigned char *)str, strlen(str));
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenstatic const char *solr_escape_id_str(const char *str)
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen const char *p;
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen if (*p == '\0')
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen switch (*p) {
85144b5f0bc763de14c7d87291a90ef74ac241a2Timo Sirainenstatic void solr_quote(string_t *dest, const char *str)
85144b5f0bc763de14c7d87291a90ef74ac241a2Timo Sirainenstatic void solr_quote_http(string_t *dest, const char *str)
95a1a5195d56f3cf5d1e529aad668f87ad3b979bTimo Sirainenstatic void fts_solr_set_default_ns(struct solr_fts_backend *backend)
85144b5f0bc763de14c7d87291a90ef74ac241a2Timo Sirainen struct mail_namespace *ns = backend->backend.ns;
85144b5f0bc763de14c7d87291a90ef74ac241a2Timo Sirainen struct fts_solr_user *fuser = FTS_SOLR_USER_CONTEXT(ns->user);
85144b5f0bc763de14c7d87291a90ef74ac241a2Timo Sirainen const struct fts_solr_settings *set = &fuser->set;
0bf3eac1110a902e7ec7e695c64e8e46c114e623Timo Sirainen const char *str;
85144b5f0bc763de14c7d87291a90ef74ac241a2Timo Sirainen mail_namespace_find_prefix(ns->user->namespaces,
85144b5f0bc763de14c7d87291a90ef74ac241a2Timo Sirainen i_error("fts_solr: default_ns setting points to "
85144b5f0bc763de14c7d87291a90ef74ac241a2Timo Sirainen "nonexistent namespace");
f153a2cec0319f549388d28f8cfd4d50229d1132Timo Sirainen mail_namespace_find_inbox(ns->user->namespaces);
f153a2cec0319f549388d28f8cfd4d50229d1132Timo Sirainen while (backend->default_ns->alias_for != NULL)
f153a2cec0319f549388d28f8cfd4d50229d1132Timo Sirainen backend->default_ns = backend->default_ns->alias_for;
85144b5f0bc763de14c7d87291a90ef74ac241a2Timo Sirainenstatic void fts_box_name_get_root(struct mail_namespace **ns, const char **name)
85144b5f0bc763de14c7d87291a90ef74ac241a2Timo Sirainen ((*ns)->flags & NAMESPACE_FLAG_INBOX_USER) != 0) {
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen /* ugly workaround to allow selecting INBOX from a Maildir/
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen when it's not in the inbox=yes namespace. */
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenstatic const char *
63e2edd14ae7b1dc4a08e2e659501dbf519462f9Timo Sirainenfts_box_get_root(struct mailbox *box, struct mail_namespace **ns_r)
63e2edd14ae7b1dc4a08e2e659501dbf519462f9Timo Sirainen struct mail_namespace *ns = mailbox_get_namespace(box);
63e2edd14ae7b1dc4a08e2e659501dbf519462f9Timo Sirainen if (t_imap_utf8_to_utf7(box->name, &name) < 0)
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenstatic struct fts_backend *fts_backend_solr_alloc(void)
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenfts_backend_solr_init(struct fts_backend *_backend, const char **error_r)
40ad2c4902e9d83557f2e8a4bff3d98fea2c8aa1Timo Sirainen struct solr_fts_backend *backend = (struct solr_fts_backend *)_backend;
aef92409cf369afdd2ecd81a4f80083cd4082f46Timo Sirainen struct fts_solr_user *fuser = FTS_SOLR_USER_CONTEXT(_backend->ns->user);
aef92409cf369afdd2ecd81a4f80083cd4082f46Timo Sirainen const char *str;
14c474d9f4591c397ed0b5206af6537c7b52c924Timo Sirainen if (solr_connection_init(fuser->set.url, fuser->set.debug,
2d49f150b4bce6f2f59a84e268e4777901c3e42cTimo Sirainen str = solr_escape_id_str(_backend->ns->user->username);
8d131435ba4648c8821160ec38d508c97177c715Timo Sirainenstatic void fts_backend_solr_deinit(struct fts_backend *_backend)
8d131435ba4648c8821160ec38d508c97177c715Timo Sirainen struct solr_fts_backend *backend = (struct solr_fts_backend *)_backend;
1ff03d2244dd08effadfecec365bec28793a1aa0Timo Sirainensolr_add_ns_query(string_t *str, struct solr_fts_backend *backend,
b5b3b4c9159f506cdfdce7399faaeeffdf73faf7Timo Sirainen if (ns == backend->default_ns || *ns->prefix == '\0') {
e64d7b6f388fecd0c83a4f2acb54e30d5ac98c6cTimo Sirainensolr_add_ns_query_http(string_t *str, struct solr_fts_backend *backend,
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenfts_backend_solr_get_last_uid_fallback(struct solr_fts_backend *backend,
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen unsigned int count;
bdd7a96c363346f7c38f389791be1487ca08775bTimo Sirainen str_append(str, "fl=uid&rows=1&sort=uid+desc&q=");
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen mailbox_get_open_status(box, STATUS_UIDVALIDITY, &status);
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen str_printfa(str, "uidv:%u+box:", status.uidvalidity);
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen pool = pool_alloconly_create("solr last uid lookup", 1024);
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen if (solr_connection_select(backend->solr_conn, str_c(str),
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen /* no UIDs */
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen uidvals = array_get(&results[0]->uids, &count);
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen if (count == 1 && uidvals[0].seq1 == uidvals[0].seq2) {
bdd7a96c363346f7c38f389791be1487ca08775bTimo Sirainen i_error("fts_solr: Last UID lookup returned multiple rows");
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenfts_backend_solr_get_last_uid(struct fts_backend *_backend,
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen /* either nothing has been indexed, or the index was corrupted.
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen do it the slow way. */
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen if (fts_backend_solr_get_last_uid_fallback(backend, box, last_uid_r) < 0)
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenfts_backend_solr_update_init(struct fts_backend *_backend)
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen ctx = i_new(struct solr_fts_backend_update_context, 1);
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen ctx->cmd = str_new(default_pool, SOLR_CMDBUF_SIZE);
1deffbbcd7affd2ec89284f3c644cc73db0a2b90Timo Sirainenstatic void xml_encode_id(struct solr_fts_backend_update_context *ctx,
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenfts_backend_solr_add_doc_prefix(struct solr_fts_backend_update_context *ctx,
d9fda7e3a0fa5551547ac3e3054b837fc77f4bfbTimo Sirainen "<field name=\"uid\">%u</field>"
d9fda7e3a0fa5551547ac3e3054b837fc77f4bfbTimo Sirainen "<field name=\"uidv\">%u</field>",
2f8da04d700cc23fcd6630226a4866e828b761bdTimo Sirainen str_append(ctx->cmd, "</field><field name=\"user\">");
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenfts_backed_solr_build_commit(struct solr_fts_backend_update_context *ctx)
aef92409cf369afdd2ecd81a4f80083cd4082f46Timo Sirainen solr_connection_post_more(ctx->post, str_data(ctx->cmd),
9061a2a9a7f8da780a5b50af3603f828167c6b13Timo Sirainenfts_backend_solr_update_deinit(struct fts_backend_update_context *_ctx)
9061a2a9a7f8da780a5b50af3603f828167c6b13Timo Sirainen (struct solr_fts_backend_update_context *)_ctx;
63e2edd14ae7b1dc4a08e2e659501dbf519462f9Timo Sirainen const char *str;
9061a2a9a7f8da780a5b50af3603f828167c6b13Timo Sirainen /* commit and wait until the documents we just indexed are
9061a2a9a7f8da780a5b50af3603f828167c6b13Timo Sirainen visible to the following search */
9061a2a9a7f8da780a5b50af3603f828167c6b13Timo Sirainen str = t_strdup_printf("<commit waitFlush=\"false\" "
9061a2a9a7f8da780a5b50af3603f828167c6b13Timo Sirainen "waitSearcher=\"%s\"/>",
aef92409cf369afdd2ecd81a4f80083cd4082f46Timo Sirainen if (solr_connection_post(backend->solr_conn, str) < 0)
9061a2a9a7f8da780a5b50af3603f828167c6b13Timo Sirainenfts_backend_solr_update_set_mailbox(struct fts_backend_update_context *_ctx,
9061a2a9a7f8da780a5b50af3603f828167c6b13Timo Sirainen (struct solr_fts_backend_update_context *)_ctx;
4fcee755029b42c1f31227211290fa5047c00075Timo Sirainen fts_index_set_last_uid(ctx->cur_box, ctx->prev_uid);
4fcee755029b42c1f31227211290fa5047c00075Timo Sirainen ctx->id_box_name = i_strdup(fts_box_get_root(box, &ns));
4fcee755029b42c1f31227211290fa5047c00075Timo Sirainen mailbox_get_open_status(box, STATUS_UIDVALIDITY, &status);
4fcee755029b42c1f31227211290fa5047c00075Timo Sirainenfts_backend_solr_update_expunge(struct fts_backend_update_context *_ctx,
4fcee755029b42c1f31227211290fa5047c00075Timo Sirainen (struct solr_fts_backend_update_context *)_ctx;
2a6af811ea3de3cf9e2f15e446674dd21b0705f3Timo Sirainen (void)solr_connection_post(backend->solr_conn, str_c(cmd));
aef92409cf369afdd2ecd81a4f80083cd4082f46Timo Sirainenfts_backend_solr_uid_changed(struct solr_fts_backend_update_context *ctx,
aef92409cf369afdd2ecd81a4f80083cd4082f46Timo Sirainen ctx->post = solr_connection_post_begin(backend->solr_conn);
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenfts_backend_solr_update_set_build_key(struct fts_backend_update_context *_ctx,
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen (struct solr_fts_backend_update_context *)_ctx;
94aa90d2d17a7aebcda5a4193a62e80ddbb169b7Timo Sirainen str_append(ctx->cmd, "<field name=\"body\">");
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenfts_backend_solr_update_unset_build_key(struct fts_backend_update_context *_ctx)
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen (struct solr_fts_backend_update_context *)_ctx;
90c23747727c85f80e4e8eed7968f0edbeac7ac5Timo Sirainenfts_backend_solr_update_build_more(struct fts_backend_update_context *_ctx,
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen (struct solr_fts_backend_update_context *)_ctx;
95a1a5195d56f3cf5d1e529aad668f87ad3b979bTimo Sirainen if (str_len(ctx->cmd) > SOLR_CMDBUF_SIZE-128) {
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen solr_connection_post_more(ctx->post, str_data(ctx->cmd),
2a6af811ea3de3cf9e2f15e446674dd21b0705f3Timo Sirainenstatic int fts_backend_solr_refresh(struct fts_backend *backend ATTR_UNUSED)
2a6af811ea3de3cf9e2f15e446674dd21b0705f3Timo Sirainenstatic int fts_backend_solr_optimize(struct fts_backend *backend ATTR_UNUSED)
2a6af811ea3de3cf9e2f15e446674dd21b0705f3Timo Sirainensolr_add_definite_query(string_t *str, struct mail_search_arg *arg)
47e0598840ecffa364ebed523e06939e22738f06Timo Sirainensolr_add_definite_query_args(string_t *str, struct mail_search_arg *arg,
47e0598840ecffa364ebed523e06939e22738f06Timo Sirainenfts_backend_solr_lookup(struct fts_backend *_backend, struct mailbox *box,
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen mailbox_get_open_status(box, STATUS_UIDVALIDITY | STATUS_UIDNEXT,
1bea995196e46157e495a78b8f93780c576b3ef8Timo Sirainen str_printfa(str, "fl=uid,score&rows=%u&sort=uid+asc&q=",
bd503f12eb667df389a99162f567bd8785798f55Timo Sirainen if (!solr_add_definite_query_args(str, args, and_args)) {
bd503f12eb667df389a99162f567bd8785798f55Timo Sirainen /* can't search this query */
b780aa272b742a43579cdb523cc79cc8d4521306Timo Sirainen /* use a separate filter query for selecting the mailbox. it shouldn't
04b8a90af181cc4c7959266855e8ed50a22ed413Timo Sirainen affect the score and there could be some caching benefits too. */
04b8a90af181cc4c7959266855e8ed50a22ed413Timo Sirainen solr_quote_http(str, box->storage->user->username);
04b8a90af181cc4c7959266855e8ed50a22ed413Timo Sirainen str_printfa(str, "+%%2Buidv:%u+%%2Bbox:", status.uidvalidity);
2a6af811ea3de3cf9e2f15e446674dd21b0705f3Timo Sirainen pool = pool_alloconly_create("fts solr search", 1024);
2a6af811ea3de3cf9e2f15e446674dd21b0705f3Timo Sirainen ret = solr_connection_select(backend->solr_conn, str_c(str),
ae8817f05005f57bba32479a610b52d083e2b6ebTimo Sirainen array_append_array(&result->definite_uids, &results[0]->uids);
ae8817f05005f57bba32479a610b52d083e2b6ebTimo Sirainen array_append_array(&result->scores, &results[0]->scores);
2a6af811ea3de3cf9e2f15e446674dd21b0705f3Timo Sirainenmailbox_get_id(struct solr_fts_backend *backend, struct mail_namespace *ns,
2a6af811ea3de3cf9e2f15e446674dd21b0705f3Timo Sirainensolr_search_multi(struct solr_fts_backend *backend, string_t *str,
6cb2c6ecddcdbeac9e6c73a292244747e12a793eTimo Sirainen HASH_TABLE(char *, struct mailbox *) mailboxes;
2a6af811ea3de3cf9e2f15e446674dd21b0705f3Timo Sirainen unsigned int i, len;
029cfcdce65b284d5230adf1c920a5f526b03b5cTimo Sirainen /* use a separate filter query for selecting the mailbox. it shouldn't
2a6af811ea3de3cf9e2f15e446674dd21b0705f3Timo Sirainen affect the score and there could be some caching benefits too. */
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen solr_quote_http(str, backend->backend.ns->owner->username);
47e0598840ecffa364ebed523e06939e22738f06Timo Sirainen hash_table_create(&mailboxes, default_pool, 0, str_hash, strcmp);
bf8f4f90cb5e5f32c2611ba3425557964b9c47fcTimo Sirainen mailbox_get_open_status(boxes[i], STATUS_UIDVALIDITY, &status);
ae8817f05005f57bba32479a610b52d083e2b6ebTimo Sirainen str_printfa(str, "%%2B(%%2Buidv:%u+%%2Bbox:", status.uidvalidity);
47e0598840ecffa364ebed523e06939e22738f06Timo Sirainen box_id = mailbox_get_id(backend, ns, box_name, status.uidvalidity);
47e0598840ecffa364ebed523e06939e22738f06Timo Sirainen hash_table_insert(mailboxes, box_id, boxes[i]);
e34d170f8f0e084bd94bfbc1a7085ece67e508dfTimo Sirainen if (solr_connection_select(backend->solr_conn, str_c(str),
e34d170f8f0e084bd94bfbc1a7085ece67e508dfTimo Sirainen box = hash_table_lookup(mailboxes, solr_results[i]->box_id);
e34d170f8f0e084bd94bfbc1a7085ece67e508dfTimo Sirainen i_warning("fts_solr: Lookup returned unexpected mailbox "
2a6af811ea3de3cf9e2f15e446674dd21b0705f3Timo Sirainen fts_result = array_append_space(&fts_results);
1bea995196e46157e495a78b8f93780c576b3ef8Timo Sirainen fts_result->definite_uids = solr_results[i]->uids;
629600d9a85e8025c15a5eaeb80329e116e022c9Timo Sirainen result->box_results = array_idx_modifiable(&fts_results, 0);
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainenfts_backend_solr_lookup_multi(struct fts_backend *_backend,
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen str_printfa(str, "fl=ns,box,uidv,uid,score&rows=%u&sort=box+asc,uid+asc&q=",
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen if (solr_add_definite_query_args(str, args, and_args)) {
4bc96ba6f1d67a90a75fa131bcd2cd508ea5a05aTimo Sirainen if (solr_search_multi(backend, str, boxes, result) < 0)
b222354c9553cd60b7dd418885e10c0473f73985Timo Sirainen /* FIXME: maybe_uids could be handled also with some more work.. */