imap-thread.c revision 59151b71059df1190acd75d8717ed04a7920c862
/* Copyright (C) 2002-2006 Timo Sirainen */
/* Implementation of draft-ietf-imapext-thread-12 threading algorithm
Message threads are permanently stored using mail-hash API. The links
are built according to normal threading rules. Base subject grouping
however isn't saved, so it needs to be done the slow way.
We do this permanent storing and optimization only when we're threading
all messages, which is what pretty much all the webmails do. Otherwise
we'd need to have separate thread trees for each search query, which
isn't practical.
threading rules, so no further sorting is required to be done. Dummy nodes
are sorted by their children, and since their children are also sorted, it
practically means that they need to be moved whenever their first child
is changed.
Adding new messages to the hash is easy and fast. Removing messages
however might cause changes all over the thread tree. Luckily this is
rare so we can just optimize the common cases. This is done with reference
counts:
A node is created for each seen Message-ID. We also store the dummy ones
for which no actual message exists. Each time a Message-ID is seen in the
In-Reply-To or References headers, the Message-ID node's reference count
is increased.
Expunging then decreases reference counts for each Message-ID. There are
two rare problematic cases when expunging:
1) Duplicate Message-IDs
2) Broken parent-child relationships:
a) different messages describe them differently in their References
headers
b) loops
For these cases expunging a message affecting these may cause larger
thread reorganizations. We mark these with expunge_rebuilds flag, so that
if the problematic message is expunged, we'll just rebuild everything.
When a message is expunged, either its reference count drops to zero in
which case it's removed completely, otherwise it's turned into a dummy
node.
Typically when reference count drops to zero it means it the node has no
children. One special case is however a thread with:
[1:dummy] [2:dummy] [3:ref 1, 2] [4:ref 2]
When 3 is removed, 1's refcount drops to zero but 2 is still referenced
by 4. In this case the 2's parent must be updated.
When we open the mail-hash, we check that no messages have been expunged
from mailbox which haven't also been removed from the hash. If they have,
we rebuild the thread tree. Otherwise we add the new messages to the hash
and then send the results to client.
*/
#include "common.h"
#include "array.h"
#include "crc32.h"
#include "ostream.h"
#include "str.h"
#include "message-id.h"
#include "imap-base-subject.h"
#include "mail-storage.h"
#include "mail-search.h"
#include "imap-thread.h"
/* FIXME: the mailbox accessing API needs some cleaning up. we shouldn't be
including all kinds of private headers */
#include "../lib-storage/index/index-storage.h"
#include "mail-hash.h"
#include "mail-index-private.h"
#include "mail-index-sync-private.h"
#include <stdlib.h>
#define IMAP_THREAD_CONTEXT(obj) \
/* how much memory to allocate initially. these are very rough
approximations. */
#define APPROX_MSG_EXTRA_COUNT 10
#define APPROX_MSGID_SIZE 45
/* Try to buffer this much data before sending it to output stream. */
#define OUTPUT_BUF_SIZE 2048
#define HDR_MESSAGE_ID "message-id"
#define HDR_IN_REPLY_TO "in-reply-to"
#define HDR_REFERENCES "references"
#define HDR_SUBJECT "subject"
struct msgid_rec {
const char *msgid;
};
struct mail_thread_rec {
struct mail_hash_record rec;
};
struct mail_thread_moved_rec {
struct mail_thread_rec rec;
/* first_child_idx and its siblings are in moved_recs */
unsigned int moved_children:1;
};
struct mail_thread_root_rec {
/* a linked list of roots which belong to the same thread */
struct mail_thread_root_rec *next;
/* another record has this record in its next-pointer.
this record isn't a root anymore. */
unsigned int nonroot:1;
/* idx points to moved_recs */
unsigned int moved:1;
/* subject contained a Re: or Fwd: */
unsigned int reply:1;
};
#define ROOT_REC_IS_DUMMY(rec) \
struct thread_context {
struct mail_search_context *search_ctx;
struct mailbox_transaction_context *t;
struct mail_thread_rec tmp_rec;
struct mail_search_arg tmp_search_arg;
struct mail_search_seqset seqset;
/* Hash record idx -> Message-ID */
ARRAY_DEFINE(msgid_map, const char *);
struct mail_hash *msgid_hash;
struct hash_table *subject_hash;
unsigned int cmp_match_count;
unsigned int id_is_uid:1;
unsigned int failed:1;
};
struct imap_thread_mailbox {
struct mailbox_vfuncs super;
struct mail_hash *msgid_hash;
/* set only temporarily while needed */
struct thread_context *ctx;
};
static unsigned int imap_thread_storage_module_id;
struct thread_context *ctx)
{
else
}
}
{
}
static int
const struct mail_thread_rec **rec_r)
{
const void *value;
return -1;
}
return 0;
}
static unsigned int mail_thread_hash_key(const void *key)
{
return key_rec->msgid_crc32;
}
static int
{
int ret;
return -1;
if (ret == 0) {
return 0;
}
found_msgid = msgid;
}
*msgid_r = found_msgid;
return 0;
}
if (found_msgid != NULL) {
/* hash collisions, we can't figure this out */
return -1;
}
found_msgid = msgid;
}
}
*msgid_r = found_msgid;
return 0;
}
static const char *
const struct mail_thread_rec *parent_rec)
{
const struct mail_thread_rec *child_rec;
const char *msgid;
return NULL;
continue;
/* each non-dummy child must have a valid In-Reply-To or
References header pointing to the parent, otherwise it
wouldn't be our child */
if (parent_rec->msgid_crc32 == 0) {
"msgid_crc32=0 node has children");
return NULL;
}
&msgid) == 0)
return msgid;
}
return NULL;
}
static const char *
{
if (*p != NULL)
return *p;
/* we can get the Message-ID directly */
return NULL;
return NULL;
} else {
/* fallback to finding from children's references */
}
return NULL;
return *p;
}
void *context)
{
const char *msgid;
return FALSE;
ctx->cmp_match_count++;
/* either a match or a collision, need to look closer */
/* we couldn't figure out the Message-ID for whatever reason.
we'll need to fallback to rebuilding the whole thread */
return FALSE;
}
}
static unsigned int mail_thread_hash_rec(const void *p)
{
const struct mail_thread_rec *rec = p;
return rec->msgid_crc32;
}
static int
struct mail_search_arg *search_args)
{
struct mailbox_status status;
const struct mail_hash_header *hdr;
unsigned int count;
return -1;
/* each search condition requires their own separate thread
index. for now we don't bother to support anything else than
"search all" threading, since that's what pretty much all
the clients do anyway. */
}
}
/* we want to build it in memory */
/* view is a bit out of date, can't optimize */
} else {
/* rebuild */
}
/* non-empty hash. add only the new messages in there. */
return -1;
}
/* some messages have been expunged. have to rebuild. */
} else {
/* after all these checks, this is the only case we
can actually optimize. */
hdr->message_count) < 0)
}
}
/* fallback to using in-memory hash */
struct index_mailbox *ibox =
ctx->msgid_hash =
sizeof(struct mail_thread_rec), 0,
tbox);
}
/* initialize searching */
search_args, NULL);
/* at this point the hash is either locked or we're using in-memory
hash where it doesn't matter */
ctx->msgid_pool =
return 0;
}
{
static const char *wanted_headers[] = {
};
struct imap_thread_mailbox *tbox =
struct mailbox_header_lookup_ctx *headers_ctx;
struct thread_context *ctx;
if (type != MAIL_THREAD_REFERENCES)
i_fatal("Only REFERENCES threading supported");
ret = 0;
ret = -1;
break;
}
while (ret == 0 &&
t_push();
t_pop();
}
if (mail_thread_finish(ctx) < 0)
ret = -1;
if (mailbox_transaction_commit(&ctx->t, 0) < 0)
ret = -1;
/* try again with in-memory hash */
} else {
break;
}
}
return ret;
}
static int
const struct mail_thread_rec *parent_rec)
{
const struct mail_thread_rec *child_rec;
struct mail_thread_rec tmp_rec;
return -1;
return -1;
}
}
return 0;
}
static int
{
const struct mail_thread_rec *rec;
struct mail_thread_rec tmp_rec;
return -1;
return -1;
}
/* last reference to the node, remove it completely.
it may however still have children, so update their
parents */
return -1;
if (rec->parent_idx != 0) {
return -1;
}
return -1;
return -1;
}
return 0;
} else {
return -1;
}
return 1;
}
}
static int
const struct mail_thread_rec *rec,
const struct mail_thread_rec **nondummy_rec_r)
{
return -1;
}
*nondummy_rec_r = rec;
return 0;
}
const struct mail_thread_rec *rec1,
const struct mail_thread_rec *rec2)
{
/* assume that rec2 is already non-dummy */
return 1;
}
/* use the sent date as long as both of the dates are valid */
/* otherwise fallback to comparing UIDs.
put dummy records to end of list */
return -1;
}
static int
{
struct mail_thread_rec tmp_rec;
const struct mail_thread_rec *rec;
return -1;
if (idx == parent_idx) {
/* first child */
} else {
/* sibling */
}
}
static int
const struct mail_thread_rec *child_rec,
const struct mail_thread_rec *parent_rec,
bool remove_existing)
{
struct mail_thread_rec tmp_rec;
if (child_rec->parent_idx == 0) {
/* we're not sorting root nodes */
return 0;
}
if (parent_rec == NULL) {
/* not given, have to look up */
&parent_rec) < 0)
return -1;
}
if (parent_rec->first_child_idx == 0) {
/* this is the first child */
tmp_rec = *parent_rec;
&tmp_rec) < 0) {
return -1;
}
} else {
/* find the position where we want it inserted */
return -1;
return -1;
/* unlink the child from here */
if (found)
return -1;
child_rec->parent_idx) < 0)
return -1;
} else {
}
if (idx == 0)
break;
return -1;
}
/* already in the right position */
return found ? -1 : 0;
}
/* insert into this position */
child_rec->parent_idx) < 0)
return -1;
/* update the child's next_idx */
&tmp_rec) < 0)
return -1;
}
if (remove_existing && !found) {
/* go through the rest and remove the existing child */
if (idx == 0) {
/* should have been found */
return -1;
}
return -1;
}
child_rec->parent_idx) < 0)
return -1;
}
}
return -1;
return 0;
/* parent is a dummy and we updated its first child.
that might move the parent */
}
{
struct mail_thread_rec rec;
const struct mail_thread_rec *recp;
const void *value;
int ret;
return -1;
}
return 0;
}
return -1;
}
if (ret == 0) {
/* first time we see this message */
return -1;
}
return 0;
}
/* seen before in references */
const char **p;
return -1;
}
return -1;
if (*p == NULL)
} else {
/* duplicate */
struct mail_thread_rec orig_rec;
rec.msgid_crc32 = 0;
return -1;
}
/* if the original message gets expunged, the thread tree must
be rebuilt. */
&orig_rec) < 0) {
return -1;
}
}
return 0;
}
{
struct mail_thread_rec rec;
return -1;
if (child_rec->parent_idx == 0)
return 0;
return -1;
/* find the node which links to this child */
sibling_idx = idx;
return -1;
}
return -1;
return -1;
/* remove from old parent's children list */
rec = *sibling_rec;
if (sibling_idx != parent_idx)
else
return -1;
}
/* parent is a dummy and we removed its first child.
this might move the parent */
return -1;
}
/* update the child's parent. it's added to new parent's children list
elsewhere. since this node was originally elsewhere as a dummy,
expunging this node will need to move it back there. */
return -1;
}
return 0;
}
static bool
{
while (parent_rec->parent_idx != 0) {
return TRUE;
&parent_rec) < 0)
return TRUE;
}
return FALSE;
}
{
const struct mail_thread_rec *rec;
struct mail_thread_rec tmp_rec;
while (idx != 0) {
return 1;
if (!rec->expunge_rebuilds) {
&tmp_rec) < 0) {
return -1;
}
}
}
return 0;
}
static int
{
struct mail_thread_rec tmp_rec;
const void *value;
int ret;
return -1;
}
if (ret > 0) {
&tmp_rec) < 0) {
return -1;
}
return 0;
}
/* not found, create */
return -1;
}
return 0;
}
static int
{
struct mail_thread_rec tmp_rec;
return -1;
/* create the msgid even if we don't use it. it's important because */
&parent_idx, &parent_rec) < 0)
return -1;
/* already got a parent, don't want to replace it.
if the old parent gets expunged, we'll need a rebuild */
tmp_rec = *parent_rec;
&tmp_rec) < 0) {
return -1;
}
return 0;
}
/* look up again, since the pointer may have been invalidated if
the parent_rec was inserted into hash */
return -1;
/* already have this parent, ignore */
return 0;
}
if (parent_idx == child_idx ||
return -1;
/* this would create a loop, not allowed. if any of the
parents get expunged, the loop gets removed and we'll
need a rebuild */
return -1;
return 0;
}
/* set new parent */
if (child_rec->parent_idx != 0) {
return -1;
} else {
&tmp_rec) < 0) {
return -1;
}
}
/* increase parent's refcount */
tmp_rec = *parent_rec;
return -1;
}
/* add to parent's child list */
parent_rec, FALSE);
}
const char *parent_msgid, const char *child_msgid)
{
return -1;
return -1;
return 0;
}
const char *references)
{
const char *parent_msgid, *child_msgid;
if (parent_msgid == NULL)
return 0;
return -1;
}
/* link the last message to us */
return -1;
return 1;
}
{
int ret;
sent_date = 0;
return -1;
/* link references */
if (ret < 0)
return -1;
if (ret == 0) {
return -1;
} else {
/* no references, make sure it's not linked */
return -1;
}
}
return 0;
}
static int
{
const struct mail_thread_moved_rec *mrec;
if (*moved) {
return 0;
} else {
}
}
const struct mail_thread_rec *rec,
{
bool sub_moved;
int ret;
while (idx != 0) {
return -1;
return 1;
}
if (rec->first_child_idx != 0) {
if (ret != 0)
return ret;
}
}
return 0;
}
const struct mail_thread_rec *rec,
bool children_moved)
{
}
static int
const struct mail_thread_rec *rec,
{
bool children_moved;
int ret;
while (idx != 0) {
&rec) < 0)
return -1;
break;
if (rec->first_child_idx != 0) {
/* does it have non-dummy children? */
if (ret < 0)
return -1;
if (ret > 0)
break;
}
}
return 0;
}
struct mail_thread_root_rec *rec)
{
struct mail_thread_root_rec *hash_rec;
char *hash_subject;
bool is_reply_or_forward;
return;
if (*subject == '\0')
return;
} else {
hash_subject = key;
if (!ROOT_REC_IS_DUMMY(hash_rec) &&
(ROOT_REC_IS_DUMMY(rec) ||
} else {
}
}
}
{
struct mail_thread_root_rec **roots;
const struct mail_thread_rec *rec;
const char *subject;
unsigned int i, count;
int ret;
ctx->subject_pool =
pool_alloconly_create("base subjects",
ctx->subject_hash =
for (i = 0; i < count; i++) {
} else {
/* find the first non-dummy child */
return -1;
if (ret < 0)
return -1;
if (ret == 0) {
/* a complete dummy thread, skip it */
continue;
}
return -1;
}
t_push();
t_pop();
}
}
return 0;
}
static int
struct mail_thread_root_rec *r2)
{
/* use the sent date as long as both of the dates are valid */
/* otherwise fallback to comparing UIDs */
}
unsigned int parent_mrec_idx,
struct mail_thread_moved_rec *child_mrec,
{
bool children_moved;
return -1;
&rec) < 0)
return -1;
return -1;
break;
}
if (prev_idx == 0) {
/* insert as first */
} else {
}
return 0;
}
unsigned int parent_mrec_idx,
const struct mail_thread_root_rec *parent_rrec)
{
const struct mail_thread_rec *rec;
struct mail_thread_moved_rec *mrec;
&children_moved, &rec) < 0)
return -1;
}
unsigned int parent_mrec_idx,
const struct mail_thread_rec *parent_rec)
{
const struct mail_thread_rec *rec;
struct mail_thread_moved_rec *mrec;
return -1;
return -1;
}
return 0;
}
struct mail_thread_root_rec *rrec)
{
struct mail_thread_root_rec *next;
const struct mail_thread_rec *rec;
struct mail_thread_moved_rec *mrec;
return -1;
if (!ROOT_REC_IS_DUMMY(rrec)) {
/* the record has the correct date already. the following
messages that have the reply-flag set will be children
of this record. */
/* move the parent */
return -1;
}
}
/* if there are more messages, they'll be siblings to this
record, and a dummy root will be added. */
/* create dummy */
}
}
} else {
/* add the rest of the records as children to this dummy */
if (!ROOT_REC_IS_DUMMY(next))
else {
&rec) < 0)
return -1;
return -1;
}
}
}
return 0;
}
{
struct mail_thread_root_rec **roots;
unsigned int i, count;
for (i = 0; i < count; i++) {
return -1;
}
}
return 0;
}
{
/* move the nonroots to the end of the array. we just want to get
rid of them. */
return -1;
}
{
struct mail_thread_root_rec **roots;
unsigned int count;
so we can compare them directly. the first non-dummy node is also
the oldest one as the children lists are always kept sorted */
}
{
return -1;
}
return 0;
}
bool moved)
{
const struct mail_thread_rec *rec;
bool children_moved;
int ret;
/* FIXME: there could be some more sanity checks, for example verify
that nodes are returned sorted */
return -1;
return -1;
if (next_idx == 0) {
/* no siblings - special case to avoid extra paranthesis */
/* dummy node, just skip this */
if (rec->first_child_idx != 0) {
}
return 0;
}
children_moved)) < 0)
return -1;
return -1;
if (ret != 0) {
}
return 0;
}
for (;;) {
if (STR_NEED_FLUSH(str, 0)) {
/* string getting full, flush it */
return -1;
str_truncate(str, 0);
}
return -1;
if (ret == 0) {
/* only child */
return -1;
}
/* dummy with children */
} else {
/* node with children */
return -1;
}
return -1;
if (idx == 0)
break;
&children_moved, &rec) < 0)
return -1;
}
return 1;
}
struct mail_thread_root_rec *root)
{
const struct mail_thread_rec *rec;
const struct mail_thread_moved_rec *mrec;
/* string getting full, flush it */
return -1;
str_truncate(str, 0);
}
} else {
return -1;
}
return -1;
}
if (rec->first_child_idx != 0) {
return -1;
}
else {
/* just a bunch of dummy nodes */
}
return 0;
}
{
struct mail_thread_root_rec *const *roots;
unsigned int i, count;
/* sort root nodes again, they have been modified since the last time */
for (i = 0; i < count; i++) {
/* nonroots are last in the list */
break;
}
return -1;
}
return 0;
}
{
const struct mail_hash_header *hdr;
const struct mail_thread_rec *rec;
struct mail_thread_root_rec *root_rec;
return -1;
if (hdr->record_count == 0)
return 0;
/* (2) save root nodes */
return -1;
so that the roots can be directly sorted */
return -1;
struct mail_thread_root_rec, 1);
}
}
/* (4) */
/* (5) Gather together messages under the root that have the same
base subject text. */
if (gather_base_subjects(ctx) < 0)
return -1;
/* (5.C) Merge threads with the same thread subject. */
if (merge_subject_threads(ctx) < 0)
return -1;
/* (6) Sort again and send replies */
t_push();
t_pop();
return 0;
}
static int
const struct mail_thread_rec **rec_r)
{
const void *value;
const char *message_id;
return -1;
if (message_id == NULL)
return 0;
return 0;
return -1;
/* duplicate Message-ID probably */
return -1;
}
return 1;
}
{
const char *references, *msgid;
const void *value;
int ret;
if (references == NULL)
return 0;
return 0;
t_push();
/* tmp_mail may be changed below, so we have to save the
references string */
do {
ctx->cmp_match_count = 0;
ctx->cmp_last_idx = 0;
break;
}
if (ret == 0) {
/* there's only one key with this crc32 value, so it
must be what we're looking for */
}
break;
t_pop();
}
const struct mail_thread_rec *rec)
{
struct mail_thread_rec tmp_rec;
return -1;
}
if (rec->parent_idx != 0) {
/* since our sent_date got removed, we may need to be moved */
return -1;
}
return 1;
}
static int
{
const struct mail_thread_rec *rec;
int ret;
/* init */
struct mail_index_transaction *t;
struct mailbox_transaction_context *mt;
return 0;
mt = MAIL_STORAGE_TRANSACTION(t);
ctx->msgid_pool =
/* deinit */
}
return 0;
} else {
/* locking had failed */
return 0;
}
return 0;
}
return 0;
if (ret < 0)
return 0;
/* We have a parent but no References header, so it means
there's In-Reply-To. Don't bother verifying this, just
unreference the parent. */
return 0;
}
/* now unreference the expunged message itself */
return 0;
} else {
return 0;
}
return 0;
}
{
int ret;
return ret;
}
{
tbox->msgid_hash =
sizeof(struct mail_thread_rec), 0,
tbox);
return;
}
static struct mailbox_sync_context *
{
struct mailbox_sync_context *ctx;
/* we don't want to get back here */
}
return ctx;
}
{
struct imap_thread_mailbox *tbox;
if (next_hook_mailbox_opened != NULL)
else {
/* delayed opening used. we want to try to open the hash
anyway, because if syncing expunges anything and we didn't
notice it, we would have to rebuild the hash */
}
}
void imap_thread_init(void)
{
}
void imap_thread_deinit(void)
{
}