src/lib-imap/imap-base-subject.c

/* Copyright (c) 2002-2018 Dovecot authors, see the included COPYING file */

/* Implemented against draft-ietf-imapext-sort-10 and
   draft-ietf-imapext-thread-12 */

#include "lib.h"
#include "buffer.h"
#include "charset-utf8.h"
#include "message-header-decode.h"
#include "imap-base-subject.h"

/* Pack the whitespace of the NUL-terminated text held in buf, in place:
   every run of tab, CR, LF and space characters is replaced by a single
   space. If anything was rewritten, the buffer's used size is set to the
   packed text plus its trailing NUL; otherwise the buffer is left
   untouched. */
static void pack_whitespace(buffer_t *buf)
{
    char *data, *dest;
    bool last_lwsp;

    data = buffer_get_modifiable_data(buf, NULL);

    /* check if we need to do anything: find the first character that
       has to be rewritten. A space followed by ANY other whitespace
       character starts a run that must be packed, so stop already at
       that space. Stopping only at a following CR/LF (as happens with
       e.g. "foo \r\n bar") would keep the preceding space and then emit
       a second one for the CR/LF, leaving two consecutive spaces. */
    while (*data != '\0') {
        if (*data == '\t' || *data == '\n' || *data == '\r' ||
            (*data == ' ' && (data[1] == ' ' || data[1] == '\t' ||
                              data[1] == '\r' || data[1] == '\n')))
            break;
        data++;
    }

    if (*data == '\0')
        return;

    /* @UNSAFE: convert/pack the whitespace. dest never advances past
       data, so the text only ever shrinks. */
    dest = data; last_lwsp = FALSE;
    while (*data != '\0') {
        if (*data == '\t' || *data == ' ' ||
            *data == '\r' || *data == '\n') {
            /* emit one space for the first character of a run,
               drop the rest of the run */
            if (!last_lwsp) {
                *dest++ = ' ';
                last_lwsp = TRUE;
            }
        } else {
            *dest++ = *data;
            last_lwsp = FALSE;
        }
        data++;
    }
    *dest = '\0';

    /* +1 keeps the trailing NUL inside the used area */
    data = buffer_get_modifiable_data(buf, NULL);
    buffer_set_used_size(buf, (size_t) (dest - data)+1);
}

/* Strip everything at the end of the text (but not before start_pos) that
   matches the subj-trailer ABNF:

       subj-trailer    = "(fwd)" / WSP

   Sets *is_reply_or_forward_r to TRUE whenever a "(FWD)" trailer is
   removed; it is never reset to FALSE here. The buffer's used size is
   expected to include the trailing NUL. */
static void remove_subj_trailers(buffer_t *buf, size_t start_pos,
                 bool *is_reply_or_forward_r)
{
    static const char fwd_trailer[] = "(FWD)";
    const size_t fwd_len = sizeof(fwd_trailer) - 1;
    const char *text;
    size_t used, end;

    text = buffer_get_data(buf, &used);
    if (used == 0) {
        /* not even a trailing NUL in the buffer */
        return;
    }

    /* end = length of the text without its trailing NUL */
    end = used - 1;
    while (end > start_pos) {
        if (text[end - 1] == ' ') {
            end--;
            continue;
        }
        if (end < fwd_len ||
            memcmp(text + end - fwd_len, fwd_trailer, fwd_len) != 0)
            break;

        *is_reply_or_forward_r = TRUE;
        end -= fwd_len;
    }

    if (end == used - 1) {
        /* nothing was removed */
        return;
    }

    /* truncate and restore the NUL terminator */
    buffer_set_used_size(buf, end);
    buffer_append_c(buf, '\0');
}

/* Try to skip one subj-blob ("[" *BLOBCHAR "]" followed by at most one
   space) at the beginning of *datap.

   Returns TRUE and advances *datap past the blob on success. Returns FALSE
   and leaves *datap untouched if the text doesn't begin with '[', or if a
   nested '[' or the end of the text is reached before the closing ']'. */
static bool remove_blob(const char **datap)
{
    const char *p = *datap;

    if (p[0] != '[')
        return FALSE;

    /* jump over the blob contents: up to the next bracket of either
       kind, or to the end of the text */
    p++;
    p += strcspn(p, "[]");
    if (*p != ']')
        return FALSE;
    p++;

    /* a single space following the blob belongs to it */
    if (*p == ' ')
        p++;

    *datap = p;
    return TRUE;
}

/* Try to remove one subj-leader from the text beginning at *start_pos by
   advancing *start_pos past it. The buffer contents aren't modified.

   A single leading space is always skipped, regardless of whether the
   rest matches. After that a full "blobs + RE/FW/FWD + optional blob + ':'"
   prefix is either skipped completely or not at all.

   Returns TRUE if *start_pos was advanced. *is_reply_or_forward_r is set
   to TRUE only when a RE/FW/FWD prefix was removed. */
static bool remove_subj_leader(buffer_t *buf, size_t *start_pos,
                   bool *is_reply_or_forward_r)
{
    /* order matters: "FWD" must be tried before its prefix "FW" */
    static const char *const refwd_words[] = { "RE", "FWD", "FW" };
    const char *base, *p;
    size_t i, word_len, matched_len;
    bool removed_space = FALSE;

    /* subj-leader     = (*subj-blob subj-refwd) / WSP

       subj-blob       = "[" *BLOBCHAR "]" *WSP
       subj-refwd      = ("re" / ("fw" ["d"])) *WSP [subj-blob] ":"

       BLOBCHAR        = %x01-5a / %x5c / %x5e-7f
                       ; any CHAR except '[' and ']' */
    base = (const char *)buf->data + *start_pos;

    if (*base == ' ') {
        /* independent from checks below - always removed */
        base++;
        *start_pos += 1;
        removed_space = TRUE;
    }
    p = base;

    /* *subj-blob */
    while (*p == '[') {
        if (!remove_blob(&p))
            return removed_space;
    }

    /* "re" / "fw" / "fwd" - compared in uppercase form */
    matched_len = 0;
    for (i = 0; i < sizeof(refwd_words) / sizeof(refwd_words[0]); i++) {
        word_len = strlen(refwd_words[i]);
        if (strncmp(p, refwd_words[i], word_len) == 0) {
            matched_len = word_len;
            break;
        }
    }
    if (matched_len == 0)
        return removed_space;
    p += matched_len;

    /* *WSP - whitespace has been packed, so at most one space */
    if (*p == ' ')
        p++;

    /* [subj-blob] */
    if (*p == '[' && !remove_blob(&p))
        return removed_space;

    /* ":" */
    if (*p != ':')
        return removed_space;
    p++;

    *start_pos += (size_t)(p - base);
    *is_reply_or_forward_r = TRUE;
    return TRUE;
}

/* If the text at *start_pos begins with a subj-blob and there is still
   some text left after it, advance *start_pos past the blob and return
   TRUE. Otherwise leave *start_pos alone and return FALSE. */
static bool remove_blob_when_nonempty(buffer_t *buf, size_t *start_pos)
{
    const char *begin = (const char *)buf->data + *start_pos;
    const char *rest = begin;

    if (*rest != '[' || !remove_blob(&rest))
        return FALSE;

    if (*rest == '\0') {
        /* removing the blob would leave nothing - keep it */
        return FALSE;
    }

    *start_pos += (size_t)(rest - begin);
    return TRUE;
}

/* If the text at *start_pos begins with "[FWD:" and the whole text ends
   with "]", strip both: the trailing "]" is cut off from the buffer and
   *start_pos is advanced past the header. Sets *is_reply_or_forward_r to
   TRUE and returns TRUE in that case, otherwise returns FALSE without
   changing anything. */
static bool remove_subj_fwd_hdr(buffer_t *buf, size_t *start_pos,
                bool *is_reply_or_forward_r)
{
    static const char fwd_hdr[] = "[FWD:";
    const size_t fwd_hdr_len = sizeof(fwd_hdr) - 1;
    const char *text = buf->data;
    size_t used = buf->used;

    /* subj-fwd        = subj-fwd-hdr subject subj-fwd-trl
       subj-fwd-hdr    = "[fwd:"
       subj-fwd-trl    = "]" */

    /* the header must be checked first: only once it has matched is the
       text known to be long enough for the used-2 access below */
    if (strncmp(text + *start_pos, fwd_hdr, fwd_hdr_len) != 0)
        return FALSE;

    /* text[used-1] is the trailing NUL, so text[used-2] is the last
       actual character */
    if (text[used - 2] != ']')
        return FALSE;

    /* drop "]" together with the NUL and terminate the text again */
    buffer_set_used_size(buf, used - 2);
    buffer_append_c(buf, '\0');

    *start_pos += fwd_hdr_len;
    *is_reply_or_forward_r = TRUE;
    return TRUE;
}

/* Extract the "base subject" of a Subject header value, following the
   numbered steps of the SORT/THREAD base subject algorithm.

   The subject is decoded into a buffer created from pool, and the returned
   pointer refers to a position inside that buffer. *is_reply_or_forward_r
   is set to TRUE if any reply/forward marker (RE:, FW:, FWD:, "(FWD)",
   "[FWD: ...]") was removed, otherwise to FALSE. */
const char *imap_get_base_subject_cased(pool_t pool, const char *subject,
                    bool *is_reply_or_forward_r)
{
    buffer_t *buf;
    size_t subject_len, start_pos = 0;
    bool leader_removed, blob_removed;

    *is_reply_or_forward_r = FALSE;

    subject_len = strlen(subject);
    buf = buffer_create_dynamic(pool, subject_len);

    /* (1) Convert any RFC 2047 encoded-words in the subject to
       UTF-8.  Convert all tabs and continuations to space.
       Convert all multiple spaces to a single space. */
    message_header_decode_utf8((const unsigned char *)subject, subject_len,
                   buf, uni_utf8_to_decomposed_titlecase);
    /* the helpers below rely on the text being NUL-terminated inside
       the buffer's used area */
    buffer_append_c(buf, '\0');
    pack_whitespace(buf);

    for (;;) {
        /* (2) Remove all trailing text of the subject that matches
           the subj-trailer ABNF, repeat until no more matches are
           possible. */
        remove_subj_trailers(buf, start_pos, is_reply_or_forward_r);

        for (;;) {
            /* (3) Remove all prefix text of the subject that
               matches the subj-leader ABNF. */
            leader_removed = remove_subj_leader(buf, &start_pos,
                                is_reply_or_forward_r);

            /* (4) If there is prefix text of the subject that
               matches the subj-blob ABNF, and removing that prefix
               leaves a non-empty subj-base, then remove the prefix
               text. This is attempted on every round, whether or
               not step (3) removed anything. */
            blob_removed = remove_blob_when_nonempty(buf, &start_pos);

            /* (5) Repeat (3) and (4) until no matches remain. */
            if (!leader_removed && !blob_removed)
                break;
        }

        /* (6) If the resulting text begins with the subj-fwd-hdr ABNF
           and ends with the subj-fwd-trl ABNF, remove the
           subj-fwd-hdr and subj-fwd-trl and repeat from step (2). */
        if (!remove_subj_fwd_hdr(buf, &start_pos, is_reply_or_forward_r))
            break;
    }

    /* (7) The resulting text is the "base subject" used in the
       SORT. */
    return (const char *)buf->data + start_pos;
}