/* src/lib-mail/message-header-encode.c
 *
 * message-header-encode.c revision a7dcd5b93e608feeffe980e268eac6c5b9f5b21c */
/* Copyright (c) 2009-2014 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "str.h"
#include "base64.h"
#include "message-header-encode.h"

/* Number of characters an encoded-word adds around its payload:
   the "=?utf-8?q?" prefix plus the "?=" suffix (the "b" variant has the
   same length). */
#define MIME_WRAPPER_LEN (strlen("=?utf-8?q?""?="))
/* Line length budget the encoders in this file fold against. */
#define MIME_MAX_LINE_LEN 76

/* TRUE for space, TAB and LF. Note that CR is not included and that the
   argument is evaluated more than once. */
#define IS_LWSP(c) \
    ((c) == ' ' || (c) == '\t' || (c) == '\n')

/* Returns TRUE if the byte at input[i] cannot be copied to a header as-is.

   input - buffer being scanned
   i     - index of the byte to examine; must be < len
   len   - number of valid bytes in input

   A byte needs encoding when it is:
    - a CR that is not immediately followed by LF,
    - a (CR)LF that ends the input or is not followed by SP/TAB,
    - a '=' that starts "=?" at the beginning of the input or right after
      LWSP (it would otherwise look like the start of an encoded-word),
    - any other byte below 32 (TAB excepted) or with the high bit set. */
static bool input_idx_need_encoding(const unsigned char *input,
                    unsigned int i, unsigned int len)
{
    unsigned char ch = input[i];

    if (ch == '\r') {
        /* a lone CR is never valid */
        if (i + 1 == len || input[i + 1] != '\n')
            return TRUE;
        /* CRLF: judge it by what follows the LF */
        i++;
        ch = '\n';
    }

    if (ch == '\n') {
        /* a trailing LF has to be dropped by the caller */
        if (i + 1 == len)
            return TRUE;
        /* LF must be followed by folding whitespace, otherwise the
           caller has to add it */
        if (input[i + 1] == ' ' || input[i + 1] == '\t')
            return FALSE;
        return TRUE;
    }

    if (ch == '\t') {
        /* TAB is the one control character that passes through */
        return FALSE;
    }

    if (ch == '=') {
        /* Only "<start or LWSP>=?" is a problem. The LWSP itself is not
           reported, which is why we look backwards from the '='. */
        if (i > 0 && !IS_LWSP(input[i - 1]))
            return FALSE;
        if (i + 2 > len)
            return FALSE;
        return input[i + 1] == '?' ? TRUE : FALSE;
    }

    /* 8bit bytes and the remaining control characters */
    if ((ch & 0x80) != 0 || ch < 32)
        return TRUE;
    return FALSE;
}

/* Appends input[0..len) to output as one or more "=?utf-8?q?...?="
   encoded-words, folding with "\n\t" between words.

   input          - bytes to encode
   len            - number of bytes in input
   output         - string the result is appended to
   first_line_len - number of characters already present on the current
                    output line; if too little room is left, a fold is
                    emitted before the first encoded-word

   Space is written as '_'; '=', '?', '_', bytes below 32 and bytes with the
   high bit set are written as "=XX"; everything else is copied unchanged. */
void message_header_encode_q(const unsigned char *input, unsigned int len,
                 string_t *output, unsigned int first_line_len)
{
    /* payload characters that fit on a line holding nothing else */
    const unsigned int full_line_room =
        MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN;
    unsigned int pos, room;

    if (first_line_len >= full_line_room - 3) {
        /* not even one "=XX" would fit - start on a fresh line.
           The leading TAB uses up one column. */
        str_append(output, "\n\t");
        room = full_line_room - 1;
    } else {
        room = full_line_room - first_line_len;
    }

    str_append(output, "=?utf-8?q?");
    pos = 0;
    while (pos < len) {
        unsigned char ch;

        if (room < 3) {
            /* Don't split inside a UTF-8 sequence: step back over
               continuation bytes, removing one 3-character "=XX"
               from the output per step.
               NOTE(review): this assumes every byte stepped over
               was written as "=XX", i.e. that a continuation byte
               is always preceded by another 8bit byte - confirm
               behaviour for malformed UTF-8 input. */
            while (pos > 0 && (input[pos] & 0xc0) == 0x80) {
                str_truncate(output, str_len(output)-3);
                pos--;
            }
            str_append(output, "?=\n\t=?utf-8?q?");
            room = full_line_room - 1;
        }

        ch = input[pos];
        if (ch == ' ') {
            str_append_c(output, '_');
            room -= 1;
        } else if (ch == '=' || ch == '?' || ch == '_' ||
               ch < 32 || (ch & 0x80) != 0) {
            str_printfa(output, "=%02X", ch);
            room -= 3;
        } else {
            str_append_c(output, ch);
            room -= 1;
        }
        pos++;
    }
    str_append(output, "?=");
}

/* Appends input[0..len) to output as one or more "=?utf-8?b?...?="
   (base64) encoded-words, folding with "\n\t" between words.

   input          - bytes to encode
   len            - number of bytes in input
   output         - string the result is appended to
   first_line_len - number of characters already present on the current
                    output line; if no room is left for an encoded-word, a
                    fold is emitted first

   When the input has to be split across several encoded-words, the split
   point is moved backwards so that it does not land on a UTF-8
   continuation byte. */
void message_header_encode_b(const unsigned char *input, unsigned int len,
                 string_t *output, unsigned int first_line_len)
{
    unsigned int line_len, line_len_left, max;

    line_len = first_line_len;
    if (line_len >= MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN) {
        str_append(output, "\n\t");
        line_len = 1;
    }

    for (;;) {
        line_len_left = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - line_len;
        /* upper bound for the number of input bytes that may fit;
           shrunk below until the encoded size really fits */
        max = MAX_BASE64_DECODED_SIZE(line_len_left);
        do {
            max--;
            if (max >= len) {
                /* Everything that is left is a candidate. This
                   also covers max == len: input[len] is outside
                   the caller's buffer and must not be examined. */
                max = len;
            } else {
                /* all of it doesn't fit. find a character where we
                   can split it from. */
                while (max > 0 && (input[max] & 0xc0) == 0x80)
                    max--;
            }
        } while (MAX_BASE64_ENCODED_SIZE(max) > line_len_left &&
             max > 0);

        if (max > 0) {
            str_append(output, "=?utf-8?b?");
            base64_encode(input, max, output);
            str_append(output, "?=");
        }

        input += max;
        len -= max;

        if (len == 0)
            break;

        /* continue on a new line; the TAB takes one column */
        str_append(output, "\n\t");
        line_len = 1;
    }
}

/* Convenience wrapper for NUL-terminated input: encodes the whole string
   with message_header_encode_data() and appends the result to output. */
void message_header_encode(const char *input, string_t *output)
{
    const unsigned char *data = (const void *)input;
    unsigned int data_len = strlen(input);

    message_header_encode_data(data, data_len, output);
}

/* Appends input[0..len) to output, replacing the parts that cannot appear
   in a header as-is with Q- or B-encoded "=?utf-8?...?=" words.

   input  - header value bytes
   len    - number of bytes in input
   output - string the result is appended to

   Processing is done one physical line at a time:
    - if nothing needs encoding, the input is copied unchanged;
    - otherwise everything from the first to the last word that needs
      encoding (on the current line) is encoded as one run, and the text
      before and after it is copied unchanged;
    - a trailing [CR]LF is dropped; any other [CR]LF is kept, followed by
      the existing SP/TAB or an added TAB, and the rest of the input is
      handled by a recursive call. */
void message_header_encode_data(const unsigned char *input, unsigned int len,
                string_t *output)
{
    unsigned int i, j, first_line_len, cur_line_len, last_idx;
    unsigned int enc_chars, enc_len, base64_len, q_len;
    const unsigned char *next_line_input;
    unsigned int next_line_len = 0;
    /* must start out FALSE: it is only set when the line break begins
       with CR, but it is tested for every line break */
    bool use_q, cr = FALSE;

    /* find the first word that needs encoding */
    for (i = 0; i < len; i++) {
        if (input_idx_need_encoding(input, i, len))
            break;
    }
    if (i == len) {
        /* no encoding necessary */
        str_append_data(output, input, len);
        return;
    }
    /* go back to the beginning of the word so it is fully encoded */
    if (input[i] != '\r' && input[i] != '\n') {
        while (i > 0 && !IS_LWSP(input[i-1]))
            i--;
    }

    /* write the prefix */
    str_append_data(output, input, i);
    /* NOTE(review): when the prefix contains a LF, j ends up as the offset
       where the prefix's last line starts, so this stores that offset
       rather than the length of the last line (i - j). Left unchanged
       because it affects where lines are folded - confirm intent. */
    first_line_len = j = i;
    while (j > 0 && input[j-1] != '\n') j--;
    if (j != 0)
        first_line_len = j;

    input += i;
    len -= i;

    /* we'll encode data only up to the next LF, the rest is handled
       recursively. */
    next_line_input = memchr(input, '\n', len);
    if (next_line_input != NULL) {
        /* treat CRLF as a unit: point at the CR */
        if (next_line_input != input && next_line_input[-1] == '\r')
            next_line_input--;
        cur_line_len = next_line_input - input;
        next_line_len = len - cur_line_len;
        len = cur_line_len;
    }

    /* find the last word that needs encoding */
    last_idx = 0; enc_chars = 0;
    for (i = 0; i < len; i++) {
        if (input_idx_need_encoding(input, i, len)) {
            last_idx = i + 1;
            enc_chars++;
        }
    }
    while (last_idx < len && !IS_LWSP(input[last_idx]))
        last_idx++;

    /* figure out if we should use Q or B encoding. Prefer Q if it's not
       too much larger. */
    enc_len = last_idx;
    base64_len = MAX_BASE64_ENCODED_SIZE(enc_len);
    q_len = enc_len + enc_chars*3;
    use_q = q_len*2/3 <= base64_len;

    /* and do it */
    if (enc_len == 0)
        ;
    else if (use_q)
        message_header_encode_q(input, enc_len, output, first_line_len);
    else
        message_header_encode_b(input, enc_len, output, first_line_len);
    /* the rest of the current line needs no encoding */
    str_append_data(output, input + last_idx, len - last_idx);

    if (next_line_input != NULL) {
        /* we're at [CR]LF */
        i = 0;
        if (next_line_input[0] == '\r') {
            cr = TRUE;
            i++;
        }
        i_assert(next_line_input[i] == '\n');
        if (++i == next_line_len)
            return; /* drop trailing [CR]LF */

        /* keep the line break in the same form as in the input */
        if (cr)
            str_append_c(output, '\r');
        str_append_c(output, '\n');

        if (next_line_input[i] == ' ' || next_line_input[i] == '\t') {
            str_append_c(output, next_line_input[i]);
            i++;
        } else {
            /* make it valid folding whitespace by adding a TAB */
            str_append_c(output, '\t');
        }
        message_header_encode_data(next_line_input+i, next_line_len-i,
                       output);
    }
}