fts-backend-lucene.c revision 03a83759c3ca0825ec899fc3dbb05b04ed0d911e
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik/* Copyright (c) 2006-2011 Dovecot authors, see the included COPYING file */
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik#define LUCENE_INDEX_DIR_NAME "lucene-indexes"
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik#define LUCENE_EXPUNGE_LOG_NAME "dovecot-expunges.log"
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct fts_expunge_log_append_ctx *expunge_ctx;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikstatic int fts_backend_lucene_mkdir(struct lucene_fts_backend *backend)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik return mailbox_list_mkdir_root(backend->backend.ns->list,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_lucene_get_mailbox_guid(struct mailbox *box, mail_guid_128_t *guid_r)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (mailbox_get_metadata(box, MAILBOX_METADATA_GUID,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik i_error("lucene: Couldn't get mailbox %s GUID: %s",
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik box->vname, mailbox_get_last_error(box, NULL));
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik memcpy(guid_r, metadata.guid, MAIL_GUID_128_SIZE);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_select(struct lucene_fts_backend *backend, struct mailbox *box)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik unsigned char guid_hex[MAILBOX_GUID_HEX_LENGTH];
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik unsigned int i;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik backend->selected_box_generation == box->generation_sequence)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (fts_lucene_get_mailbox_guid(box, &guid) < 0)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik buffer_create_data(&buf, guid_hex, MAILBOX_GUID_HEX_LENGTH);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik binary_to_hex_append(&buf, guid, MAIL_GUID_128_SIZE);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik lucene_index_select_mailbox(backend->index, wguid_hex);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik lucene_index_unselect_mailbox(backend->index);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikstatic struct fts_backend *fts_backend_lucene_alloc(void)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik backend = i_new(struct lucene_fts_backend, 1);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_init(struct fts_backend *_backend,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik FTS_LUCENE_USER_CONTEXT(_backend->ns->user);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik path = mailbox_list_get_path(_backend->ns->list, NULL,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik i_assert(path != NULL); /* fts already checked this */
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik backend->dir_path = i_strconcat(path, "/"LUCENE_INDEX_DIR_NAME, NULL);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik backend->index = lucene_index_init(backend->dir_path,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik backend->index = lucene_index_init(backend->dir_path,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik path = t_strconcat(backend->dir_path, "/"LUCENE_EXPUNGE_LOG_NAME, NULL);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik backend->expunge_log = fts_expunge_log_init(path);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikstatic void fts_backend_lucene_deinit(struct fts_backend *_backend)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik fts_expunge_log_deinit(&backend->expunge_log);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_get_last_uid(struct fts_backend *_backend,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (fts_index_get_last_uid(box, last_uid_r))
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik /* either nothing has been indexed, or the index was corrupted.
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik do it the slow way. */
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (lucene_index_get_last_uid(backend->index, last_uid_r) < 0)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik (void)fts_index_set_last_uid(box, *last_uid_r);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_update_init(struct fts_backend *_backend)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct lucene_fts_backend_update_context *ctx;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik ctx = i_new(struct lucene_fts_backend_update_context, 1);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_need_optimize(struct lucene_fts_backend_update_context *ctx)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik (struct lucene_fts_backend *)ctx->ctx.backend;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (ctx->added_msgs >= LUCENE_OPTIMIZE_BATCH_MSGS_COUNT)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (lucene_index_get_doc_count(backend->index, &numdocs) < 0)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (fts_expunge_log_uid_count(backend->expunge_log, &expunges) < 0)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik numdocs / expunges <= 50; /* >2% of index has been expunged */
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_update_deinit(struct fts_backend_update_context *_ctx)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct lucene_fts_backend_update_context *ctx =
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik (struct lucene_fts_backend_update_context *)_ctx;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (fts_expunge_log_append_commit(&ctx->expunge_ctx) < 0)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (fts_backend_lucene_need_optimize(ctx)) {
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct mail_user *user = backend->backend.ns->user;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik /* the optimize affects all mailboxes within namespace,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik so just use any mailbox name in it */
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik cmd = t_strdup_printf("OPTIMIZE\t0\t%s\t%s\n",
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_update_set_mailbox(struct fts_backend_update_context *_ctx,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct lucene_fts_backend_update_context *ctx =
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik (struct lucene_fts_backend_update_context *)_ctx;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik (void)fts_index_set_last_uid(ctx->box, ctx->last_uid);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik ctx->first_box_vname = i_strdup(box->vname);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_update_expunge(struct fts_backend_update_context *_ctx,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct lucene_fts_backend_update_context *ctx =
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik (struct lucene_fts_backend_update_context *)_ctx;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (!fts_index_get_last_uid(ctx->box, &ctx->last_indexed_uid))
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik /* don't waste time adding expunge to log for a message that
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik isn't even indexed. this check is racy, because indexer may
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik just be in the middle of indexing this message. we'll
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik attempt to avoid that by skipping the expunging only if
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik indexing hasn't been done for a while (100 msgs). */
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik fts_expunge_log_append_begin(backend->expunge_log);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (fts_backend_select(backend, ctx->box) < 0)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik fts_expunge_log_append_next(ctx->expunge_ctx,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_update_set_build_key(struct fts_backend_update_context *_ctx,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct lucene_fts_backend_update_context *ctx =
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik (struct lucene_fts_backend_update_context *)_ctx;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (lucene_index_build_init(backend->index) < 0)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (fts_backend_select(backend, ctx->box) < 0)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik case FTS_BACKEND_BUILD_KEY_BODY_PART_BINARY:
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_update_unset_build_key(struct fts_backend_update_context *_ctx)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct lucene_fts_backend_update_context *ctx =
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik (struct lucene_fts_backend_update_context *)_ctx;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_update_build_more(struct fts_backend_update_context *_ctx,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct lucene_fts_backend_update_context *ctx =
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik (struct lucene_fts_backend_update_context *)_ctx;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik ret = lucene_index_build_more(backend->index, ctx->uid,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_refresh(struct fts_backend *_backend)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikstatic int fts_backend_lucene_rescan(struct fts_backend *_backend)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (lucene_index_rescan(backend->index, _backend->ns->list) < 0)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik return lucene_index_optimize(backend->index);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikstatic int fts_backend_lucene_optimize(struct fts_backend *_backend)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik ret = lucene_index_expunge_from_log(backend->index,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik /* log was corrupted, need to rescan */
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik ret = lucene_index_rescan(backend->index, _backend->ns->list);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik ret = lucene_index_optimize(backend->index);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_lookup(struct fts_backend *_backend, struct mailbox *box,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct mail_search_arg *args, bool and_args,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik ret = lucene_index_lookup(backend->index, args, and_args,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik/* a char* hash function from ASU -- from glib */
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikstatic unsigned int wstr_hash(const void *p)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik unsigned int g, h = 0;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik while (*s != '\0') {
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik h = (h << 4) + *s;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if ((g = h & 0xf0000000UL)) {
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik h = h ^ (g >> 24);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikmailboxes_get_guids(struct mailbox *const boxes[],
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct hash_table *guids, struct fts_multi_result *result)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik ARRAY_DEFINE(box_results, struct fts_result);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik unsigned int i, j;
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik p_array_init(&box_results, result->pool, 32);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik if (fts_mailbox_get_guid(boxes[i], &guid) < 0)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik i_assert(strlen(guid) == MAILBOX_GUID_HEX_LENGTH);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik guid_dup = t_new(wchar_t, MAILBOX_GUID_HEX_LENGTH + 1);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik for (j = 0; j < MAILBOX_GUID_HEX_LENGTH; j++)
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik box_result = array_append_space(&box_results);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik hash_table_insert(guids, guid_dup, box_result);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik result->box_results = array_idx_modifiable(&box_results, 0);
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnikfts_backend_lucene_lookup_multi(struct fts_backend *_backend,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik struct mail_search_arg *args, bool and_args,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik guids = hash_table_create(default_pool, default_pool, 0,
49a5412cbc98e630de17359c29cb8d6ce0e16168Lukas Slebodnik ret = mailboxes_get_guids(boxes, guids, result);