fts-backend-solr-old.c revision c051fa1a87d352295dbd522a7a90729ba8d6eacf
5f5870385cff47efd2f58e7892f251cf13761528Timo Sirainen/* Copyright (c) 2006-2015 Dovecot authors, see the included COPYING file */
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainenstatic const char *solr_escape_chars = "+-&|!(){}[]^\"~*?:\\/ ";
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen /* Valid characters in XML:
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen [#x10000-#x10FFFF]
c4267cf4c40fb1f866b5958ff122ef836b8c5dfbTimo Sirainen This function gets called only for #x80 and higher */
7bd72e4deca3cbf757dd1ea298486d9f3bc24226Timo Sirainenxml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen unsigned int i;
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen for (i = 0; i < len; i++) {
7bd72e4deca3cbf757dd1ea298486d9f3bc24226Timo Sirainen switch (data[i]) {
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen /* exceptions to the following control char check */
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen /* SOLR doesn't like control characters.
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen replace them with spaces. */
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen /* make sure the character is valid for XML
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen so we don't get XML parser errors */
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen unsigned int char_len =
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenstatic void xml_encode(string_t *dest, const char *str)
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen xml_encode_data(dest, (const unsigned char *)str, strlen(str));
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenstatic const char *solr_escape_id_str(const char *str)
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen const char *p;
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen if (*p == '\0')
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen switch (*p) {
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenstatic const char *solr_escape(const char *str)
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen unsigned int i;
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen return "\"\"";
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen if (strchr(solr_escape_chars, str[i]) != NULL)
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainenstatic void solr_quote(string_t *dest, const char *str)
303a87c31cb4aa198326694e231df53a043e63c7Timo Sirainenstatic void solr_quote_http(string_t *dest, const char *str)
303a87c31cb4aa198326694e231df53a043e63c7Timo Sirainen http_url_escape_param(dest, solr_escape(str));
303a87c31cb4aa198326694e231df53a043e63c7Timo Sirainenstatic void fts_solr_set_default_ns(struct solr_fts_backend *backend)
303a87c31cb4aa198326694e231df53a043e63c7Timo Sirainen struct mail_namespace *ns = backend->backend.ns;
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen struct fts_solr_user *fuser = FTS_SOLR_USER_CONTEXT(ns->user);
f2bd9e507b8befdd95a983f86664febf5c19bd95Timo Sirainen const struct fts_solr_settings *set = &fuser->set;
e2a88d59c0d47d63ce1ad5b1fd95e487124a3fd4Timo Sirainen const char *str;
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen mail_namespace_find_prefix(ns->user->namespaces,
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen i_error("fts_solr: default_ns setting points to "
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen "nonexistent namespace");
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen mail_namespace_find_inbox(ns->user->namespaces);
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen while (backend->default_ns->alias_for != NULL)
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen backend->default_ns = backend->default_ns->alias_for;
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenstatic void fts_box_name_get_root(struct mail_namespace **ns, const char **name)
c4267cf4c40fb1f866b5958ff122ef836b8c5dfbTimo Sirainen ((*ns)->flags & NAMESPACE_FLAG_INBOX_USER) != 0) {
c4267cf4c40fb1f866b5958ff122ef836b8c5dfbTimo Sirainen /* ugly workaround to allow selecting INBOX from a Maildir/
c4267cf4c40fb1f866b5958ff122ef836b8c5dfbTimo Sirainen when it's not in the inbox=yes namespace. */
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenstatic const char *
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenfts_box_get_root(struct mailbox *box, struct mail_namespace **ns_r)
c4267cf4c40fb1f866b5958ff122ef836b8c5dfbTimo Sirainen struct mail_namespace *ns = mailbox_get_namespace(box);
39e6fcc3e8b1ccb13087c232cb6bdea04d1a20a4Timo Sirainen if (t_imap_utf8_to_utf7(box->name, &name) < 0)
fdc557286bc9f92c5f3bb49096ff6e2bcec0ea79Timo Sirainenstatic struct fts_backend *fts_backend_solr_alloc(void)
8039af9679af6fb56116b353fe44f7dd4c08f031Timo Sirainenfts_backend_solr_init(struct fts_backend *_backend, const char **error_r)
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen struct solr_fts_backend *backend = (struct solr_fts_backend *)_backend;
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen struct fts_solr_user *fuser = FTS_SOLR_USER_CONTEXT(_backend->ns->user);
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen const char *str;
28cd2599128e102198758cf6080588305feb6bcdTimo Sirainen if (solr_connection_init(fuser->set.url, fuser->set.debug,
28cd2599128e102198758cf6080588305feb6bcdTimo Sirainen str = solr_escape_id_str(_backend->ns->user->username);
28cd2599128e102198758cf6080588305feb6bcdTimo Sirainenstatic void fts_backend_solr_deinit(struct fts_backend *_backend)
28cd2599128e102198758cf6080588305feb6bcdTimo Sirainen struct solr_fts_backend *backend = (struct solr_fts_backend *)_backend;
28cd2599128e102198758cf6080588305feb6bcdTimo Sirainensolr_add_ns_query(string_t *str, struct solr_fts_backend *backend,
07974f50bd55b06fd6d465f2c0e491794786e2faTimo Sirainen if (ns == backend->default_ns || *ns->prefix == '\0') {
28cd2599128e102198758cf6080588305feb6bcdTimo Sirainensolr_add_ns_query_http(string_t *str, struct solr_fts_backend *backend,
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenfts_backend_solr_get_last_uid_fallback(struct solr_fts_backend *backend,
5fb3bff645380804c9db2510940c41db6b8fdb01Timo Sirainen unsigned int count;
51327f2489a4e0e615eb9f7d921473cf8512bb79Timo Sirainen str_append(str, "fl=uid&rows=1&sort=uid+desc&q=");
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen mailbox_get_open_status(box, STATUS_UIDVALIDITY, &status);
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen str_printfa(str, "uidv:%u+AND+box:", status.uidvalidity);
bd74402ca1a39ec303075fefb1212d7e18a71531Timo Sirainen pool = pool_alloconly_create("solr last uid lookup", 1024);
bd74402ca1a39ec303075fefb1212d7e18a71531Timo Sirainen if (solr_connection_select(backend->solr_conn, str_c(str),
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen /* no UIDs */
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen uidvals = array_get(&results[0]->uids, &count);
ef11d3930c3602fc86349a4e3a53442df470b601Timo Sirainen if (count == 1 && uidvals[0].seq1 == uidvals[0].seq2) {
17da42c31202b1b3e7e308121ea17d922c24da1bTimo Sirainen i_error("fts_solr: Last UID lookup returned multiple rows");
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenfts_backend_solr_get_last_uid(struct fts_backend *_backend,
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen /* either nothing has been indexed, or the index was corrupted.
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen do it the slow way. */
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen if (fts_backend_solr_get_last_uid_fallback(backend, box, last_uid_r) < 0)
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenfts_backend_solr_update_init(struct fts_backend *_backend)
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen ctx = i_new(struct solr_fts_backend_update_context, 1);
3dd0679b6f24be0287cc42d7a60bbf59cdf8b637Timo Sirainen ctx->cmd = str_new(default_pool, SOLR_CMDBUF_SIZE);
c4267cf4c40fb1f866b5958ff122ef836b8c5dfbTimo Sirainenstatic void xml_encode_id(struct solr_fts_backend_update_context *ctx,
8039af9679af6fb56116b353fe44f7dd4c08f031Timo Sirainenfts_backend_solr_add_doc_prefix(struct solr_fts_backend_update_context *ctx,
28cd2599128e102198758cf6080588305feb6bcdTimo Sirainen "<field name=\"uid\">%u</field>"
a2f250a332dfc1e6cd4ffd196c621eb9dbf7b8a1Timo Sirainen "<field name=\"uidv\">%u</field>",
a2f250a332dfc1e6cd4ffd196c621eb9dbf7b8a1Timo Sirainen str_append(ctx->cmd, "</field><field name=\"user\">");
a2f250a332dfc1e6cd4ffd196c621eb9dbf7b8a1Timo Sirainenfts_backed_solr_build_commit(struct solr_fts_backend_update_context *ctx)
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainen solr_connection_post_more(ctx->post, str_data(ctx->cmd),
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainenfts_backend_solr_update_deinit(struct fts_backend_update_context *_ctx)
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainen (struct solr_fts_backend_update_context *)_ctx;
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainen const char *str;
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainen /* commit and wait until the documents we just indexed are
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainen visible to the following search */
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainen str = t_strdup_printf("<commit waitFlush=\"false\" "
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainen "waitSearcher=\"%s\"/>",
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainen if (solr_connection_post(backend->solr_conn, str) < 0)
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainenfts_backend_solr_update_set_mailbox(struct fts_backend_update_context *_ctx,
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainen (struct solr_fts_backend_update_context *)_ctx;
85ee28daca146e18a99a22f46c0d639e57a6ac95Timo Sirainen fts_index_set_last_uid(ctx->cur_box, ctx->prev_uid);
85ee28daca146e18a99a22f46c0d639e57a6ac95Timo Sirainen ctx->id_box_name = i_strdup(fts_box_get_root(box, &ns));
85ee28daca146e18a99a22f46c0d639e57a6ac95Timo Sirainen mailbox_get_open_status(box, STATUS_UIDVALIDITY, &status);
85ee28daca146e18a99a22f46c0d639e57a6ac95Timo Sirainenfts_backend_solr_update_expunge(struct fts_backend_update_context *_ctx,
85ee28daca146e18a99a22f46c0d639e57a6ac95Timo Sirainen (struct solr_fts_backend_update_context *)_ctx;
d756ebcfa96bd7cff02097c8f26df9df368b81b1Timo Sirainen (void)solr_connection_post(backend->solr_conn, str_c(cmd));
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenfts_backend_solr_uid_changed(struct solr_fts_backend_update_context *ctx,
c979eeda1f46483d9c963e265786b701d7683d77Timo Sirainen ctx->post = solr_connection_post_begin(backend->solr_conn);
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenfts_backend_solr_update_set_build_key(struct fts_backend_update_context *_ctx,
c4267cf4c40fb1f866b5958ff122ef836b8c5dfbTimo Sirainen (struct solr_fts_backend_update_context *)_ctx;
d42c9a8f362b76740418c4f9f9441eb7fc661e57Timo Sirainen str_append(ctx->cmd, "<field name=\"body\">");
a2f250a332dfc1e6cd4ffd196c621eb9dbf7b8a1Timo Sirainenfts_backend_solr_update_unset_build_key(struct fts_backend_update_context *_ctx)
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainen (struct solr_fts_backend_update_context *)_ctx;
0b7651dc6ad21cce8579b5957252ae0daf972668Timo Sirainenfts_backend_solr_update_build_more(struct fts_backend_update_context *_ctx,
a2f250a332dfc1e6cd4ffd196c621eb9dbf7b8a1Timo Sirainen (struct solr_fts_backend_update_context *)_ctx;
0b7651dc6ad21cce8579b5957252ae0daf972668Timo Sirainen if (str_len(ctx->cmd) > SOLR_CMDBUF_SIZE-128) {
0b7651dc6ad21cce8579b5957252ae0daf972668Timo Sirainen solr_connection_post_more(ctx->post, str_data(ctx->cmd),
923eb3dde28e4d8841c14fd6b4a69635b7070c3eTimo Sirainenstatic int fts_backend_solr_refresh(struct fts_backend *backend ATTR_UNUSED)
51cbc45fc1ac5dde29bc2adbb175945df1b4f7d4Timo Sirainenstatic int fts_backend_solr_optimize(struct fts_backend *backend ATTR_UNUSED)
5735ada0f82788ee1b5228978d5bd8dad5a04219Timo Sirainensolr_add_definite_query(string_t *str, struct mail_search_arg *arg)
c4267cf4c40fb1f866b5958ff122ef836b8c5dfbTimo Sirainensolr_add_definite_query_args(string_t *str, struct mail_search_arg *arg,
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainenfts_backend_solr_lookup(struct fts_backend *_backend, struct mailbox *box,
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen bool and_args = (flags & FTS_LOOKUP_FLAG_AND_ARGS) != 0;
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen mailbox_get_open_status(box, STATUS_UIDVALIDITY | STATUS_UIDNEXT,
ccd83028a34cc6e2b6370eb7ecf1cf25e717c2d3Timo Sirainen str_printfa(str, "fl=uid,score&rows=%u&sort=uid+asc&q=",
ccd83028a34cc6e2b6370eb7ecf1cf25e717c2d3Timo Sirainen if (!solr_add_definite_query_args(str, args, and_args)) {
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen /* can't search this query */
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen /* use a separate filter query for selecting the mailbox. it shouldn't
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen affect the score and there could be some caching benefits too. */
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen solr_quote_http(str, box->storage->user->username);
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen str_printfa(str, "+%%2Buidv:%u+%%2Bbox:", status.uidvalidity);
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen pool = pool_alloconly_create("fts solr search", 1024);
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen ret = solr_connection_select(backend->solr_conn, str_c(str),
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen if ((flags & FTS_LOOKUP_FLAG_NO_AUTO_FUZZY) == 0)
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen array_append_array(&result->definite_uids, &results[0]->uids);
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen array_append_array(&result->maybe_uids, &results[0]->uids);
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen array_append_array(&result->scores, &results[0]->scores);
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainenmailbox_get_id(struct solr_fts_backend *backend, struct mail_namespace *ns,
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainensolr_search_multi(struct solr_fts_backend *backend, string_t *str,
c4267cf4c40fb1f866b5958ff122ef836b8c5dfbTimo Sirainen HASH_TABLE(char *, struct mailbox *) mailboxes;
ccd83028a34cc6e2b6370eb7ecf1cf25e717c2d3Timo Sirainen unsigned int i, len;
c4267cf4c40fb1f866b5958ff122ef836b8c5dfbTimo Sirainen /* use a separate filter query for selecting the mailbox. it shouldn't
42507d758b053bb483de58fba55c73a9eb5d3fbaTimo Sirainen affect the score and there could be some caching benefits too. */
c4267cf4c40fb1f866b5958ff122ef836b8c5dfbTimo Sirainen solr_quote_http(str, backend->backend.ns->owner->username);
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen hash_table_create(&mailboxes, default_pool, 0, str_hash, strcmp);
ccd83028a34cc6e2b6370eb7ecf1cf25e717c2d3Timo Sirainen mailbox_get_open_status(boxes[i], STATUS_UIDVALIDITY, &status);
ccd83028a34cc6e2b6370eb7ecf1cf25e717c2d3Timo Sirainen str_printfa(str, "%%2B(%%2Buidv:%u+%%2Bbox:", status.uidvalidity);
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen box_id = mailbox_get_id(backend, ns, box_name, status.uidvalidity);
294f1a51763015cda0e2d874c5027d6fe7a2cd54Timo Sirainen hash_table_insert(mailboxes, box_id, boxes[i]);
42507d758b053bb483de58fba55c73a9eb5d3fbaTimo Sirainen if (solr_connection_select(backend->solr_conn, str_c(str),
ccd83028a34cc6e2b6370eb7ecf1cf25e717c2d3Timo Sirainen box = hash_table_lookup(mailboxes, solr_results[i]->box_id);
ccd83028a34cc6e2b6370eb7ecf1cf25e717c2d3Timo Sirainen i_warning("fts_solr: Lookup returned unexpected mailbox "
ccd83028a34cc6e2b6370eb7ecf1cf25e717c2d3Timo Sirainen fts_result = array_append_space(&fts_results);
ccd83028a34cc6e2b6370eb7ecf1cf25e717c2d3Timo Sirainen if ((flags & FTS_LOOKUP_FLAG_NO_AUTO_FUZZY) == 0)
ccd83028a34cc6e2b6370eb7ecf1cf25e717c2d3Timo Sirainen fts_result->definite_uids = solr_results[i]->uids;
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen fts_result->maybe_uids = solr_results[i]->uids;
1939d1843ee6c7ca5e5baa3967b0332341440005Timo Sirainen result->box_results = array_idx_modifiable(&fts_results, 0);
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainenfts_backend_solr_lookup_multi(struct fts_backend *_backend,
ccd83028a34cc6e2b6370eb7ecf1cf25e717c2d3Timo Sirainen bool and_args = (flags & FTS_LOOKUP_FLAG_AND_ARGS) != 0;
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen str_printfa(str, "fl=ns,box,uidv,uid,score&rows=%u&sort=box+asc,uid+asc&q=",
279b22f320f6139da5c1b0e2a5ead6692e7db947Timo Sirainen if (solr_add_definite_query_args(str, args, and_args)) {
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen if (solr_search_multi(backend, str, boxes, flags, result) < 0)
c96eb61168670cfdd7596baba18856d3f086a093Timo Sirainen /* FIXME: maybe_uids could be handled also with some more work.. */