fts-backend-solr.c revision adf8264ab1135c413bcede6af2e4248fd26a1ef9
/* Copyright (c) 2006-2011 Dovecot authors, see the included COPYING file */
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
ee359fe1384507fed6c2274e7bfe81d288de4542Stephen Gallagher#include "lib.h"
33396dc46ea52c18f47db1b5d590880806521005Sumit Bose#include "array.h"
ee359fe1384507fed6c2274e7bfe81d288de4542Stephen Gallagher#include "str.h"
33396dc46ea52c18f47db1b5d590880806521005Sumit Bose#include "hash.h"
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher#include "strescape.h"
324fb26ba803a999bedc29e93c46c84f27abf5b7Sumit Bose#include "unichar.h"
324fb26ba803a999bedc29e93c46c84f27abf5b7Sumit Bose#include "mail-storage-private.h"
324fb26ba803a999bedc29e93c46c84f27abf5b7Sumit Bose#include "mailbox-list-private.h"
324fb26ba803a999bedc29e93c46c84f27abf5b7Sumit Bose#include "mail-search.h"
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher#include "fts-api.h"
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher#include "solr-connection.h"
84ae5edab16ad6be5e3be956cb6fa031c1428eb5Stephen Gallagher#include "fts-solr-plugin.h"
84ae5edab16ad6be5e3be956cb6fa031c1428eb5Stephen Gallagher
84ae5edab16ad6be5e3be956cb6fa031c1428eb5Stephen Gallagher#include <ctype.h>
e65df5b966b27e13283c65f59f99ac44781e0333Simo Sorce
/* Initial size of the pending XML command buffer; the buffer is flushed to
   Solr once it grows to within 128 bytes of this size. */
#define SOLR_CMDBUF_SIZE (1024*64)
/* Number of rows requested from Solr for multi-mailbox lookups. */
#define SOLR_MAX_MULTI_ROWS 100000
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherstruct solr_fts_backend {
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher struct fts_backend backend;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher};
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
84ae5edab16ad6be5e3be956cb6fa031c1428eb5Stephen Gallagherstruct solr_fts_backend_update_context {
cc98edd9479d4622634a1275c98058916c14059aStephen Gallagher struct fts_backend_update_context ctx;
ee359fe1384507fed6c2274e7bfe81d288de4542Stephen Gallagher
cc98edd9479d4622634a1275c98058916c14059aStephen Gallagher struct mailbox *cur_box;
d3da1c165cdb4c1ec126a8f4b6b544ca415b9d20Pavel Březina char box_guid[MAILBOX_GUID_HEX_LENGTH+1];
d3da1c165cdb4c1ec126a8f4b6b544ca415b9d20Pavel Březina
d3da1c165cdb4c1ec126a8f4b6b544ca415b9d20Pavel Březina struct solr_connection_post *post;
1183d29d87c5c7439cf2364b7d7324d4a13b6e35Stephen Gallagher uint32_t prev_uid;
1183d29d87c5c7439cf2364b7d7324d4a13b6e35Stephen Gallagher string_t *cmd, *hdr, *hdr_fields;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher bool headers_open;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher bool cur_header_index;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher bool documents_added;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher};
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
/* Connection used by every function in this file; created on the first
   fts_backend_solr_init() call. Static storage, so it starts out as NULL. */
static struct solr_connection *solr_conn;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherstatic bool is_valid_xml_char(unichar_t chr)
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher{
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher /* Valid characters in XML:
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
c89589fa349f38214c9cb8d9389c0fd557e5dca2Simo Sorce [#x10000-#x10FFFF]
c89589fa349f38214c9cb8d9389c0fd557e5dca2Simo Sorce
c89589fa349f38214c9cb8d9389c0fd557e5dca2Simo Sorce This function gets called only for #x80 and higher */
c89589fa349f38214c9cb8d9389c0fd557e5dca2Simo Sorce if (chr > 0xd7ff && chr < 0xe000)
c89589fa349f38214c9cb8d9389c0fd557e5dca2Simo Sorce return FALSE;
c89589fa349f38214c9cb8d9389c0fd557e5dca2Simo Sorce if (chr > 0xfffd && chr < 0x10000)
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher return FALSE;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher return chr < 0x10ffff;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher}
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherstatic void
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherxml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher{
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher unichar_t chr;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher unsigned int i;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher for (i = 0; i < len; i++) {
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher switch (data[i]) {
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher case '&':
d921c1eba437662437847279f251a0a5d8f70127Maxim str_append(dest, "&amp;");
d921c1eba437662437847279f251a0a5d8f70127Maxim break;
d921c1eba437662437847279f251a0a5d8f70127Maxim case '<':
d921c1eba437662437847279f251a0a5d8f70127Maxim str_append(dest, "&lt;");
d921c1eba437662437847279f251a0a5d8f70127Maxim break;
d921c1eba437662437847279f251a0a5d8f70127Maxim case '>':
d921c1eba437662437847279f251a0a5d8f70127Maxim str_append(dest, "&gt;");
327127bb7fcc07f882209f029e14026de1b23c94Maxim break;
327127bb7fcc07f882209f029e14026de1b23c94Maxim case '\t':
327127bb7fcc07f882209f029e14026de1b23c94Maxim case '\n':
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher case '\r':
d3da1c165cdb4c1ec126a8f4b6b544ca415b9d20Pavel Březina /* exceptions to the following control char check */
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str_append_c(dest, data[i]);
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher break;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher default:
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher if (data[i] < 32) {
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher /* SOLR doesn't like control characters.
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher replace them with spaces. */
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str_append_c(dest, ' ');
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher } else if (data[i] >= 0x80) {
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher /* make sure the character is valid for XML
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher so we don't get XML parser errors */
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher unsigned int char_len =
4b6a0d0b3d42e5fdb457f47d9adfa5e66b160256Stephen Gallagher uni_utf8_char_bytes(data[i]);
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher if (i + char_len <= len &&
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher uni_utf8_get_char_n(data + i, char_len, &chr) == 1 &&
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher is_valid_xml_char(chr))
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str_append_n(dest, data + i, char_len);
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher else {
068dbee9ca7bf5b37330eff91c94ae10f288d09fJakub Hrozek str_append_n(dest, utf8_replacement_char,
98ce3c3e85a4bb2e1822bf8ab2a1c2ab9e3dd61dJakub Hrozek UTF8_REPLACEMENT_CHAR_LEN);
be65f065fef1d387281096ef095a2acef39ecc12Jakub Hrozek }
f36078af138f052cd9a30360867b0ebd0805af5eJakub Hrozek i += char_len - 1;
34c78b745eb349eef2b0f13ef2b722632aebe619Jan Cholasta } else {
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str_append_c(dest, data[i]);
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher }
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher break;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher }
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher }
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher}
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherstatic void xml_encode(string_t *dest, const char *str)
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher{
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher xml_encode_data(dest, (const unsigned char *)str, strlen(str));
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher}
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherstatic void solr_quote_http(string_t *dest, const char *str)
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher{
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str_append(dest, "%22");
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher solr_connection_http_escape(solr_conn, dest, str);
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str_append(dest, "%22");
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher}
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherstatic struct fts_backend *fts_backend_solr_alloc(void)
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher{
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher struct solr_fts_backend *backend;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher backend = i_new(struct solr_fts_backend, 1);
2a5790216f57e9bdfb2930d52860bb5300366536Jakub Hrozek backend->backend = fts_backend_solr;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher return &backend->backend;
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagher}
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagher
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagherstatic int
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagherfts_backend_solr_init(struct fts_backend *_backend,
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagher const char **error_r ATTR_UNUSED)
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagher{
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagher struct fts_solr_user *fuser = FTS_SOLR_USER_CONTEXT(_backend->ns->user);
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagher const struct fts_solr_settings *set = &fuser->set;
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagher
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagher if (solr_conn == NULL)
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagher solr_conn = solr_connection_init(set->url, set->debug);
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagher return 0;
b32159300fea63222d8dd9200ed634087704ea74Stephen Gallagher}
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
/* Frees the backend allocated by fts_backend_solr_alloc(). The shared
   connection is not touched here. */
static void fts_backend_solr_deinit(struct fts_backend *_backend)
{
	struct solr_fts_backend *solr_backend =
		(struct solr_fts_backend *)_backend;

	i_free(solr_backend);
}
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherstatic int
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherget_last_uid_fallback(struct fts_backend *_backend, struct mailbox *box,
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher uint32_t *last_uid_r)
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher{
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher const struct seq_range *uidvals;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher const char *box_guid;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher unsigned int count;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher struct solr_result **results;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher string_t *str;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher pool_t pool;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher int ret = 0;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str = t_str_new(256);
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str_append(str, "fl=uid&rows=1&sort=uid+desc&q=");
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher if (fts_mailbox_get_guid(box, &box_guid) < 0)
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek return -1;
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str_printfa(str, "box:%s+user:", box_guid);
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek if (_backend->ns->owner != NULL)
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek solr_quote_http(str, _backend->ns->owner->username);
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek else
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek str_append(str, "%22%22");
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek pool = pool_alloconly_create("solr last uid lookup", 1024);
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek if (solr_connection_select(solr_conn, str_c(str),
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek pool, &results) < 0)
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek ret = -1;
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek else if (results[0] == NULL) {
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek /* no UIDs */
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek *last_uid_r = 0;
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek } else {
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek uidvals = array_get(&results[0]->uids, &count);
6f51c802311fd81a409a26763ed45b28a3234d0dJakub Hrozek i_assert(count > 0);
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek if (count == 1 && uidvals[0].seq1 == uidvals[0].seq2) {
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher *last_uid_r = uidvals[0].seq1;
4d81fe27ced3d2e96866aeaf61661a925cb8edf1Jakub Hrozek } else {
8b1f525acd20f36c836e827de3c251088961c5d9Stephen Gallagher i_error("fts_solr: Last UID lookup returned multiple rows");
f5b6f977d4144c28e9c66f3f1c9d634d595d1117Marko Myllynen ret = -1;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher }
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher }
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher pool_unref(&pool);
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher return ret;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher}
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
72e60fd4eabcfbcdbfe01e8c38b94052bc6c2067Jakub Hrozekstatic int
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherfts_backend_solr_get_last_uid(struct fts_backend *_backend,
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher struct mailbox *box, uint32_t *last_uid_r)
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher{
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher struct fts_index_header hdr;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher if (fts_index_get_header(box, &hdr)) {
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher *last_uid_r = hdr.last_indexed_uid;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher return 0;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher }
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
5352c9b3609bca63814f9f6f03dbbbadf6c6333aStephen Gallagher /* either nothing has been indexed, or the index was corrupted.
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher do it the slow way. */
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher if (get_last_uid_fallback(_backend, box, last_uid_r) < 0)
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher return -1;
2a5790216f57e9bdfb2930d52860bb5300366536Jakub Hrozek
2a5790216f57e9bdfb2930d52860bb5300366536Jakub Hrozek (void)fts_index_set_last_uid(box, *last_uid_r);
2a5790216f57e9bdfb2930d52860bb5300366536Jakub Hrozek return 0;
2a5790216f57e9bdfb2930d52860bb5300366536Jakub Hrozek}
505383ec905863bb8f4f563f694b9bf077f9002cJakub Hrozek
505383ec905863bb8f4f563f694b9bf077f9002cJakub Hrozekstatic struct fts_backend_update_context *
172c07013d1ea99447a780fd36f49d5c3a76981bJakub Hrozekfts_backend_solr_update_init(struct fts_backend *_backend)
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher{
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher struct solr_fts_backend_update_context *ctx;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher ctx = i_new(struct solr_fts_backend_update_context, 1);
336879aabae137f9a81304f147fb0d43001654b0Simo Sorce ctx->ctx.backend = _backend;
336879aabae137f9a81304f147fb0d43001654b0Simo Sorce ctx->cmd = str_new(default_pool, SOLR_CMDBUF_SIZE);
336879aabae137f9a81304f147fb0d43001654b0Simo Sorce ctx->hdr = str_new(default_pool, 4096);
336879aabae137f9a81304f147fb0d43001654b0Simo Sorce ctx->hdr_fields = str_new(default_pool, 1024);
336879aabae137f9a81304f147fb0d43001654b0Simo Sorce return &ctx->ctx;
336879aabae137f9a81304f147fb0d43001654b0Simo Sorce}
336879aabae137f9a81304f147fb0d43001654b0Simo Sorce
336879aabae137f9a81304f147fb0d43001654b0Simo Sorcestatic void xml_encode_id(struct solr_fts_backend_update_context *ctx,
336879aabae137f9a81304f147fb0d43001654b0Simo Sorce string_t *str, uint32_t uid)
336879aabae137f9a81304f147fb0d43001654b0Simo Sorce{
336879aabae137f9a81304f147fb0d43001654b0Simo Sorce str_printfa(str, "%u/%s", uid, ctx->box_guid);
336879aabae137f9a81304f147fb0d43001654b0Simo Sorce if (ctx->ctx.backend->ns->owner != NULL) {
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str_append_c(str, '/');
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher xml_encode(str, ctx->ctx.backend->ns->owner->username);
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher }
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher}
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherstatic void
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagherfts_backend_solr_doc_open(struct solr_fts_backend_update_context *ctx,
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher uint32_t uid)
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher{
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher ctx->documents_added = TRUE;
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str_printfa(ctx->cmd, "<doc>"
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher "<field name=\"uid\">%u</field>"
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher "<field name=\"box\">%s</field>",
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher uid, ctx->box_guid);
551aa6c36797ed720487f5974dcadabf19e6ff9fStephen Gallagher str_append(ctx->cmd, "<field name=\"user\">");
96c73559adfbdac96720008fc022cb1d540b53c3Jakub Hrozek if (ctx->ctx.backend->ns->owner != NULL)
4dd615c01357b8715711aad6820ba9595d3ad377Stephen Gallagher xml_encode(ctx->cmd, ctx->ctx.backend->ns->owner->username);
70e59ed31c5a9c9ed02d9065ddf92be87c887efbJakub Hrozek str_append(ctx->cmd, "</field>");
f660877b38e563c4aa0cb1431624069808873fecJakub Hrozek
f660877b38e563c4aa0cb1431624069808873fecJakub Hrozek str_printfa(ctx->cmd, "<field name=\"id\">");
84ae5edab16ad6be5e3be956cb6fa031c1428eb5Stephen Gallagher xml_encode_id(ctx, ctx->cmd, uid);
str_append(ctx->cmd, "</field>");
}
static void
fts_backend_solr_doc_close(struct solr_fts_backend_update_context *ctx)
{
ctx->headers_open = FALSE;
if (str_len(ctx->hdr) > 0) {
str_append(ctx->cmd, "<field name=\"hdr\">");
str_append_str(ctx->cmd, ctx->hdr);
str_append(ctx->cmd, "</field>");
str_truncate(ctx->hdr, 0);
}
if (str_len(ctx->hdr_fields) > 0) {
str_append_str(ctx->cmd, ctx->hdr_fields);
str_truncate(ctx->hdr_fields, 0);
}
str_append(ctx->cmd, "</doc>");
}
/* If a POST is in progress, closes the last document and the <add> element,
   sends the remaining buffer and ends the POST. Returns 0 when there was
   nothing to send, otherwise the result of solr_connection_post_end(). */
static int
fts_backed_solr_build_commit(struct solr_fts_backend_update_context *ctx)
{
	int ret = 0;

	if (ctx->post != NULL) {
		fts_backend_solr_doc_close(ctx);
		str_append(ctx->cmd, "</add>");

		solr_connection_post_more(ctx->post, str_data(ctx->cmd),
					  str_len(ctx->cmd));
		ret = solr_connection_post_end(ctx->post);
	}
	return ret;
}
/* Ends the update transaction: sends any pending documents, posts a
   <commit/> and frees the context. Returns -1 if the context was already
   marked failed or if either request fails, otherwise 0. */
static int
fts_backend_solr_update_deinit(struct fts_backend_update_context *_ctx)
{
	struct solr_fts_backend_update_context *ctx =
		(struct solr_fts_backend_update_context *)_ctx;
	const char *commit_cmd, *wait_searcher;
	int ret = 0;

	if (_ctx->failed)
		ret = -1;
	if (fts_backed_solr_build_commit(ctx) < 0)
		ret = -1;

	/* commit and wait until the documents we just indexed are
	   visible to the following search */
	wait_searcher = ctx->documents_added ? "true" : "false";
	commit_cmd = t_strdup_printf("<commit waitFlush=\"false\" "
				     "waitSearcher=\"%s\"/>", wait_searcher);
	if (solr_connection_post(solr_conn, commit_cmd) < 0)
		ret = -1;

	str_free(&ctx->cmd);
	str_free(&ctx->hdr);
	str_free(&ctx->hdr_fields);
	i_free(ctx);
	return ret;
}
/* Switches the update context to another mailbox (or to none when box is
   NULL).

   If documents were built for the previous mailbox, its last indexed UID is
   stored first. The GUID of the new mailbox is then copied into
   ctx->box_guid. If the GUID can't be looked up, the context is marked
   failed and box_guid is left empty, so that neither an uninitialized
   pointer nor the previous mailbox's GUID gets used. */
static void
fts_backend_solr_update_set_mailbox(struct fts_backend_update_context *_ctx,
				    struct mailbox *box)
{
	struct solr_fts_backend_update_context *ctx =
		(struct solr_fts_backend_update_context *)_ctx;
	const char *box_guid;

	if (ctx->prev_uid != 0) {
		(void)fts_index_set_last_uid(ctx->cur_box, ctx->prev_uid);
		ctx->prev_uid = 0;
	}

	/* start from an empty GUID; this also keeps the terminating NUL in
	   place, since only sizeof-1 bytes are copied below */
	memset(ctx->box_guid, 0, sizeof(ctx->box_guid));
	if (box != NULL) {
		if (fts_mailbox_get_guid(box, &box_guid) < 0) {
			/* box_guid wasn't set, don't touch it */
			_ctx->failed = TRUE;
		} else {
			i_assert(strlen(box_guid) == sizeof(ctx->box_guid)-1);
			memcpy(ctx->box_guid, box_guid,
			       sizeof(ctx->box_guid)-1);
		}
	}
	ctx->cur_box = box;
}
/* Posts a <delete> request for the document of uid in the current mailbox.
   The result of the request is deliberately ignored. */
static void
fts_backend_solr_update_expunge(struct fts_backend_update_context *_ctx,
				uint32_t uid)
{
	struct solr_fts_backend_update_context *ctx =
		(struct solr_fts_backend_update_context *)_ctx;

	T_BEGIN {
		string_t *request = t_str_new(256);

		str_append(request, "<delete><id>");
		xml_encode_id(ctx, request, uid);
		str_append(request, "</id></delete>");

		(void)solr_connection_post(solr_conn, str_c(request));
	} T_END;
}
/* Called when the build key moves to another message: closes the previous
   document, or begins the POST and the <add> element if this is the first
   one, and then opens the document for uid. */
static void
fts_backend_solr_uid_changed(struct solr_fts_backend_update_context *ctx,
			     uint32_t uid)
{
	if (ctx->post != NULL) {
		fts_backend_solr_doc_close(ctx);
	} else {
		/* first document of this transaction */
		i_assert(ctx->prev_uid == 0);
		ctx->post = solr_connection_post_begin(solr_conn);
		str_append(ctx->cmd, "<add>");
	}

	ctx->prev_uid = uid;
	fts_backend_solr_doc_open(ctx, uid);
}
/* Prepares the context for the data of the given build key.

   Header keys (message and MIME headers) write "<name>: " into the hdr
   buffer; a message header that fts_header_want_indexed() accepts also
   gets a field of its own, named after the lowercased header name, opened
   in hdr_fields. A body part key opens a "body" field in the command
   buffer. Binary body parts must never get here. Always returns TRUE. */
static bool
fts_backend_solr_update_set_build_key(struct fts_backend_update_context *_ctx,
				      const struct fts_backend_build_key *key)
{
	struct solr_fts_backend_update_context *ctx =
		(struct solr_fts_backend_update_context *)_ctx;

	if (ctx->prev_uid != key->uid)
		fts_backend_solr_uid_changed(ctx, key->uid);

	switch (key->type) {
	case FTS_BACKEND_BUILD_KEY_HDR:
	case FTS_BACKEND_BUILD_KEY_MIME_HDR:
		/* only message headers can get a field of their own */
		if (key->type == FTS_BACKEND_BUILD_KEY_HDR &&
		    fts_header_want_indexed(key->hdr_name)) {
			ctx->cur_header_index = TRUE;
			str_printfa(ctx->hdr_fields, "<field name=\"%s\">",
				    t_str_lcase(key->hdr_name));
		}
		xml_encode(ctx->hdr, key->hdr_name);
		str_append(ctx->hdr, ": ");
		ctx->headers_open = TRUE;
		break;
	case FTS_BACKEND_BUILD_KEY_BODY_PART:
		ctx->headers_open = FALSE;
		str_append(ctx->cmd, "<field name=\"body\">");
		break;
	case FTS_BACKEND_BUILD_KEY_BODY_PART_BINARY:
		i_unreached();
	}
	return TRUE;
}
/* Finishes the current build key: ends the header line, or closes the field
   in the command buffer when no header is open, and closes the separately
   indexed header field if one was open. */
static void
fts_backend_solr_update_unset_build_key(struct fts_backend_update_context *_ctx)
{
	struct solr_fts_backend_update_context *ctx =
		(struct solr_fts_backend_update_context *)_ctx;

	if (ctx->headers_open) {
		/* this is called individually for each header line.
		   headers are finished only when key changes to body */
		str_append_c(ctx->hdr, '\n');
	} else {
		str_append(ctx->cmd, "</field>");
	}

	if (!ctx->cur_header_index)
		return;
	str_append(ctx->hdr_fields, "</field>");
	ctx->cur_header_index = FALSE;
}
/* Adds size bytes of data for the current build key. Header data goes to
   the hdr buffer (and to hdr_fields when the header is indexed
   separately), everything else goes straight to the command buffer. The
   command buffer is sent to Solr when it gets close to SOLR_CMDBUF_SIZE.
   Returns -1 if the context has been marked failed, otherwise 0. */
static int
fts_backend_solr_update_build_more(struct fts_backend_update_context *_ctx,
				   const unsigned char *data, size_t size)
{
	struct solr_fts_backend_update_context *ctx =
		(struct solr_fts_backend_update_context *)_ctx;

	if (_ctx->failed)
		return -1;

	if (!ctx->headers_open) {
		i_assert(!ctx->cur_header_index);
		xml_encode_data(ctx->cmd, data, size);
	} else {
		if (ctx->cur_header_index)
			xml_encode_data(ctx->hdr_fields, data, size);
		xml_encode_data(ctx->hdr, data, size);
	}

	if (str_len(ctx->cmd) <= SOLR_CMDBUF_SIZE-128)
		return 0;

	/* buffer is nearly full, send what we have so far */
	solr_connection_post_more(ctx->post, str_data(ctx->cmd),
				  str_len(ctx->cmd));
	str_truncate(ctx->cmd, 0);
	return 0;
}
/* Refresh is a no-op for this backend; always returns 0. */
static int
fts_backend_solr_refresh(struct fts_backend *backend ATTR_UNUSED)
{
	return 0;
}
/* Optimize is a no-op for this backend; always returns 0. */
static int
fts_backend_solr_optimize(struct fts_backend *backend ATTR_UNUSED)
{
	return 0;
}
/* Returns TRUE if str contains a space or any character that has a special
   meaning in Solr's query syntax. */
static bool solr_need_escaping(const char *str)
{
	static const char solr_escape_chars[] = "+-&|!(){}[]^\"~*?:\\ ";

	return strpbrk(str, solr_escape_chars) != NULL;
}
/* Appends the search string of arg to the query in str.

   Non-fuzzy searches, and fuzzy searches whose string contains characters
   special to Solr, are added as a quoted phrase. Other fuzzy searches are
   added as a bare term followed by '~'. */
static void solr_add_str_arg(string_t *str, struct mail_search_arg *arg)
{
	/* currently we'll just disable fuzzy searching if there are any
	   parameters that need escaping. solr doesn't seem to give good
	   fuzzy results even if we did escape them.. */
	if (!arg->fuzzy || solr_need_escaping(arg->value.str))
		solr_quote_http(str, arg->value.str);
	else {
		/* the term still ends up in a URL: solr_need_escaping()
		   only covers Solr's own syntax, so HTTP-escape the term
		   the same way the quoted branch does */
		solr_connection_http_escape(solr_conn, str, arg->value.str);
		str_append_c(str, '~');
	}
}
/* Appends a query for arg whose results are definite matches.

   Handles text, body and header searches; a header search only when
   fts_header_want_indexed() accepts the header. A negated argument gets
   a '-' prefix. Returns FALSE, without touching str, when the argument
   can't be handled this way. */
static bool
solr_add_definite_query(string_t *str, struct mail_search_arg *arg)
{
	switch (arg->type) {
	case SEARCH_TEXT: {
		if (arg->match_not)
			str_append_c(str, '-');
		str_append(str, "(hdr:");
		solr_add_str_arg(str, arg);
		str_append(str, "+OR+body:");
		solr_add_str_arg(str, arg);
		str_append(str, ")");
		break;
	}
	case SEARCH_BODY:
		if (arg->match_not)
			str_append_c(str, '-');
		str_append(str, "body:");
		solr_add_str_arg(str, arg);
		break;
	case SEARCH_HEADER:
	case SEARCH_HEADER_ADDRESS:
	case SEARCH_HEADER_COMPRESS_LWSP:
		if (!fts_header_want_indexed(arg->hdr_field_name))
			return FALSE;

		if (arg->match_not)
			str_append_c(str, '-');
		/* indexing names these fields after the lowercased header
		   name, so query them with the same lowercased name */
		str_append(str, t_str_lcase(arg->hdr_field_name));
		str_append_c(str, ':');
		solr_add_str_arg(str, arg);
		break;
	default:
		return FALSE;
	}
	return TRUE;
}
/* Appends the definite queries for every argument in the list, joined with
   "+AND+" or "+OR+" depending on and_args. Each argument that was added
   gets match_always set. Returns FALSE if no argument was added. */
static bool
solr_add_definite_query_args(string_t *str, struct mail_search_arg *arg,
			     bool and_args)
{
	const char *joiner = and_args ? "+AND+" : "+OR+";
	/* length of str up to the end of the last added query */
	unsigned int query_end = str_len(str);

	while (arg != NULL) {
		if (solr_add_definite_query(str, arg)) {
			arg->match_always = TRUE;
			query_end = str_len(str);
			str_append(str, joiner);
		}
		arg = arg->next;
	}

	if (str_len(str) == query_end)
		return FALSE;

	/* drop the joiner that follows the last query */
	str_truncate(str, query_end);
	return TRUE;
}
/* Appends a query for arg whose results are only potential matches.

   This applies to non-negated header searches on headers that
   fts_header_want_indexed() rejects: the "hdr" field is searched for the
   value, or for the header name when the value is empty. Returns FALSE,
   without touching str, for everything else. */
static bool
solr_add_maybe_query(string_t *str, struct mail_search_arg *arg)
{
	const char *needle;

	if (arg->type != SEARCH_HEADER &&
	    arg->type != SEARCH_HEADER_ADDRESS &&
	    arg->type != SEARCH_HEADER_COMPRESS_LWSP)
		return FALSE;

	/* for negated searches all matches would be definite, but all
	   non-matches would be maybies. too much trouble to optimize. */
	if (fts_header_want_indexed(arg->hdr_field_name) || arg->match_not)
		return FALSE;

	/* we can check if the search key exists in some header and
	   filter out the messages that have no chance of matching */
	str_append(str, "hdr:");
	if (*arg->value.str != '\0') {
		needle = arg->value.str;
	} else {
		/* checking potential existence of the header name */
		needle = arg->hdr_field_name;
	}
	solr_quote_http(str, needle);
	return TRUE;
}
/* Appends the "maybe" queries for every argument in the list, joined with
   "+AND+" or "+OR+" depending on and_args. Each argument that was added
   gets match_always set. Returns FALSE if no argument was added. */
static bool
solr_add_maybe_query_args(string_t *str, struct mail_search_arg *arg,
			  bool and_args)
{
	const char *joiner = and_args ? "+AND+" : "+OR+";
	/* length of str up to the end of the last added query */
	unsigned int query_end = str_len(str);

	while (arg != NULL) {
		if (solr_add_maybe_query(str, arg)) {
			arg->match_always = TRUE;
			query_end = str_len(str);
			str_append(str, joiner);
		}
		arg = arg->next;
	}

	if (str_len(str) == query_end)
		return FALSE;

	/* drop the joiner that follows the last query */
	str_truncate(str, query_end);
	return TRUE;
}
/* Completes the query in str with a filter on the mailbox and user, runs it
   and appends the UIDs and scores of the first result to uids_r and
   scores_r. Returns the result of solr_connection_select(). */
static int solr_search(struct fts_backend *_backend, string_t *str,
		       const char *box_guid, ARRAY_TYPE(seq_range) *uids_r,
		       ARRAY_TYPE(fts_score_map) *scores_r)
{
	pool_t result_pool = pool_alloconly_create("fts solr search", 1024);
	struct solr_result **found;
	int ret;

	/* use a separate filter query for selecting the mailbox. it shouldn't
	   affect the score and there could be some caching benefits too. */
	str_printfa(str, "&fq=%%2Bbox:%s+%%2Buser:", box_guid);
	if (_backend->ns->owner == NULL)
		str_append(str, "%22%22");
	else
		solr_quote_http(str, _backend->ns->owner->username);

	ret = solr_connection_select(solr_conn, str_c(str),
				     result_pool, &found);
	if (ret == 0 && found[0] != NULL) {
		/* copy the results out before their pool goes away */
		array_append_array(uids_r, &found[0]->uids);
		array_append_array(scores_r, &found[0]->scores);
	}
	pool_unref(&result_pool);
	return ret;
}
/* Looks up args in a single mailbox. The definite and the "maybe" queries
   are run separately with the same query prefix; their UIDs go to
   result->definite_uids and result->maybe_uids, and both add to
   result->scores. Returns 0 on success, -1 if the mailbox GUID lookup or a
   search fails. */
static int
fts_backend_solr_lookup(struct fts_backend *_backend, struct mailbox *box,
			struct mail_search_arg *args, bool and_args,
			struct fts_result *result)
{
	struct mailbox_status status;
	const char *guid;
	string_t *query;
	unsigned int base_len;

	if (fts_mailbox_get_guid(box, &guid) < 0)
		return -1;
	mailbox_get_open_status(box, STATUS_UIDNEXT, &status);

	query = t_str_new(256);
	str_printfa(query, "fl=uid,score&rows=%u&sort=uid+asc&q=",
		    status.uidnext);
	base_len = str_len(query);

	if (solr_add_definite_query_args(query, args, and_args) &&
	    solr_search(_backend, query, guid,
			&result->definite_uids, &result->scores) < 0)
		return -1;

	/* reuse the common prefix for the second query */
	str_truncate(query, base_len);
	if (solr_add_maybe_query_args(query, args, and_args) &&
	    solr_search(_backend, query, guid,
			&result->maybe_uids, &result->scores) < 0)
		return -1;

	result->scores_sorted = TRUE;
	return 0;
}
/* Completes the query in str with a filter on the user and on the GUIDs of
   the given mailboxes (a NULL-terminated array), runs it and stores one
   fts_result per returned mailbox in result->box_results. The result array
   ends with a zero-filled entry. Mailboxes whose GUID can't be looked up
   are left out of the filter. Returns 0 on success, -1 if the query
   fails. */
static int
solr_search_multi(struct fts_backend *_backend, string_t *str,
		  struct mailbox *const boxes[],
		  struct fts_multi_result *result)
{
	struct solr_result **rows;
	struct fts_result *entry;
	ARRAY_DEFINE(box_results, struct fts_result);
	struct hash_table *guid_to_box;
	struct mailbox *matched_box;
	const char *guid;
	unsigned int idx, filter_start;

	/* use a separate filter query for selecting the mailbox. it shouldn't
	   affect the score and there could be some caching benefits too. */
	str_append(str, "&fq=%2Buser:");
	if (_backend->ns->owner == NULL)
		str_append(str, "%22%22");
	else
		solr_quote_http(str, _backend->ns->owner->username);

	/* maps the GUIDs in the filter back to their mailboxes */
	guid_to_box = hash_table_create(default_pool, default_pool, 0,
					str_hash,
					(hash_cmp_callback_t *)strcmp);
	str_append(str, "%2B(");
	filter_start = str_len(str);
	for (idx = 0; boxes[idx] != NULL; idx++) {
		if (fts_mailbox_get_guid(boxes[idx], &guid) >= 0) {
			/* separator before all but the first added box */
			if (str_len(str) != filter_start)
				str_append(str, "+OR+");
			str_printfa(str, "box:%s", guid);
			hash_table_insert(guid_to_box,
					  t_strdup_noconst(guid), boxes[idx]);
		}
	}
	str_append_c(str, ')');

	if (solr_connection_select(solr_conn, str_c(str),
				   result->pool, &rows) < 0) {
		hash_table_destroy(&guid_to_box);
		return -1;
	}

	p_array_init(&box_results, result->pool, 32);
	for (idx = 0; rows[idx] != NULL; idx++) {
		matched_box = hash_table_lookup(guid_to_box, rows[idx]->box_id);
		if (matched_box != NULL) {
			entry = array_append_space(&box_results);
			entry->box = matched_box;
			entry->definite_uids = rows[idx]->uids;
			entry->scores = rows[idx]->scores;
			entry->scores_sorted = TRUE;
		} else {
			i_warning("fts_solr: Lookup returned unexpected mailbox "
				  "with guid=%s", rows[idx]->box_id);
		}
	}
	/* zero-filled entry marks the end of the results */
	(void)array_append_space(&box_results);
	result->box_results = array_idx_modifiable(&box_results, 0);
	hash_table_destroy(&guid_to_box);
	return 0;
}
/* Looks up args in several mailboxes with a single query. Only definite
   matches are returned. Returns 0 on success (also when none of the
   arguments could be turned into a query), -1 if the search fails. */
static int
fts_backend_solr_lookup_multi(struct fts_backend *backend,
			      struct mailbox *const boxes[],
			      struct mail_search_arg *args, bool and_args,
			      struct fts_multi_result *result)
{
	string_t *query = t_str_new(256);

	str_printfa(query, "fl=box,uid,score&rows=%u&sort=box+asc,uid+asc&q=",
		    SOLR_MAX_MULTI_ROWS);

	if (!solr_add_definite_query_args(query, args, and_args)) {
		/* nothing to search for */
		return 0;
	}
	/* FIXME: maybe_uids could be handled also with some more work.. */
	return solr_search_multi(backend, query, boxes, result) < 0 ? -1 : 0;
}
/* Backend definition for Solr. fts_backend_solr_alloc() copies this
   structure into every backend instance it creates. The callbacks are given
   positionally, so their order has to match the callback table declared for
   struct fts_backend (declared outside this file). */
struct fts_backend fts_backend_solr = {
 .name = "solr",
 .flags = 0,
 {
 fts_backend_solr_alloc,
 fts_backend_solr_init,
 fts_backend_solr_deinit,
 fts_backend_solr_get_last_uid,
 fts_backend_solr_update_init,
 fts_backend_solr_update_deinit,
 fts_backend_solr_update_set_mailbox,
 fts_backend_solr_update_expunge,
 fts_backend_solr_update_set_build_key,
 fts_backend_solr_update_unset_build_key,
 fts_backend_solr_update_build_more,
 fts_backend_solr_refresh,
 /* no callback in this slot -- NOTE(review): which operation this is
    depends on the callback table's declaration; confirm in fts-api */
 NULL,
 fts_backend_solr_optimize,
 fts_backend_default_can_lookup,
 fts_backend_solr_lookup,
 fts_backend_solr_lookup_multi,
 /* no callback in the last slot either -- see the note above */
 NULL
 }
};