message-decoder.c revision 373745850a588cb7ebdadf8bf2f78b1b6529b98f
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen/* Copyright (C) 2006 Timo Sirainen */
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen#include "lib.h"
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen#include "buffer.h"
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen#include "strescape.h"
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen#include "base64.h"
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen#include "charset-utf8.h"
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen#include "quoted-printable.h"
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen#include "message-parser.h"
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen#include "message-content-parser.h"
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen#include "message-header-decode.h"
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen#include "message-decoder.h"
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
/* Content-Transfer-Encoding of the MIME part that is currently being
   decoded. */
enum content_type {
	/* unrecognized encoding; such bodies are skipped */
	CONTENT_TYPE_UNKNOWN = 0,
	/* 7bit, 8bit or binary; body is used as-is */
	CONTENT_TYPE_BINARY = 1,
	/* quoted-printable */
	CONTENT_TYPE_QP = 2,
	/* base64 */
	CONTENT_TYPE_BASE64 = 3
};
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
/* Maximum number of still-encoded input bytes that have to be carried over
   to the next block. base64 is decoded in groups of 4 characters, so up to
   3 bytes of an incomplete group may be left over. A quoted-printable escape
   is 3 characters, so up to 2 bytes may be left over. */
#define MAX_ENCODING_BUF_SIZE 3

/* Maximum number of bytes of an incomplete multibyte character that have to
   be carried over to the next block. A UTF-8 character takes at most 6 bytes
   even in the original pre-RFC 3629 definition. Not sure about other
   charsets, but 10 should be more than enough for everyone. */
#define MAX_TRANSLATION_BUF_SIZE 10
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainenstruct message_decoder_context {
b162e16ade1e0d9bfae62e366caf57a3132f5963Timo Sirainen struct message_header_line hdr;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen buffer_t *buf, *buf2;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen struct charset_translation *charset_trans;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen char translation_buf[MAX_TRANSLATION_BUF_SIZE];
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen unsigned int translation_size;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen char encoding_buf[MAX_ENCODING_BUF_SIZE];
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen unsigned int encoding_size;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen char *content_charset;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen enum content_type content_type;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen unsigned int charset_utf8:1;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen};
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainenstruct message_decoder_context *message_decoder_init(void)
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen{
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen struct message_decoder_context *ctx;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
b162e16ade1e0d9bfae62e366caf57a3132f5963Timo Sirainen ctx = i_new(struct message_decoder_context, 1);
b162e16ade1e0d9bfae62e366caf57a3132f5963Timo Sirainen ctx->buf = buffer_create_dynamic(default_pool, 8192);
b162e16ade1e0d9bfae62e366caf57a3132f5963Timo Sirainen ctx->buf2 = buffer_create_dynamic(default_pool, 8192);
b162e16ade1e0d9bfae62e366caf57a3132f5963Timo Sirainen return ctx;
b162e16ade1e0d9bfae62e366caf57a3132f5963Timo Sirainen}
b162e16ade1e0d9bfae62e366caf57a3132f5963Timo Sirainen
b162e16ade1e0d9bfae62e366caf57a3132f5963Timo Sirainenvoid message_decoder_deinit(struct message_decoder_context **_ctx)
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen{
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen struct message_decoder_context *ctx = *_ctx;
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen *_ctx = NULL;
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen buffer_free(ctx->buf);
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen buffer_free(ctx->buf2);
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen i_free(ctx);
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen}
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainenstatic bool
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainenmessage_decode_header_callback(const unsigned char *data, size_t size,
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen const char *charset, void *context)
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen{
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen struct message_decoder_context *ctx = context;
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen struct charset_translation *t;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen bool unknown_charset;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
baf3e87e186453fda13bd21f7cbcb2efc8492e8bTimo Sirainen if (charset == NULL || strcasecmp(charset, "UTF-8") == 0) {
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen /* ASCII */
b162e16ade1e0d9bfae62e366caf57a3132f5963Timo Sirainen buffer_append(ctx->buf, data, size);
96e1ed172ae59cae6bc6e6ddd24d22a158e23dfeTimo Sirainen return TRUE;
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen }
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen t = charset_to_utf8_begin(charset, &unknown_charset);
b2ff0468c298dd2f525fff5977f1f24fba3a9f3bTimo Sirainen if (unknown_charset) {
/* let's just ignore this part */
return TRUE;
}
/* ignore any errors */
(void)charset_to_ucase_utf8_full(t, data, &size, ctx->buf);
charset_to_utf8_end(&t);
return TRUE;
}
static void parse_content_encoding(const unsigned char *value, size_t value_len,
void *context)
{
struct message_decoder_context *ctx = context;
ctx->content_type = CONTENT_TYPE_UNKNOWN;
switch (value_len) {
case 4:
if (memcasecmp(value, "7bit", 4) == 0 ||
memcasecmp(value, "8bit", 4) == 0)
ctx->content_type = CONTENT_TYPE_BINARY;
break;
case 6:
if (memcasecmp(value, "base64", 6) == 0)
ctx->content_type = CONTENT_TYPE_BASE64;
else if (memcasecmp(value, "binary", 6) == 0)
ctx->content_type = CONTENT_TYPE_BINARY;
break;
case 16:
if (memcasecmp(value, "quoted-printable", 16) == 0)
ctx->content_type = CONTENT_TYPE_QP;
break;
}
}
/* Content-Type parameter callback: remembers the first "charset" parameter
   of the part (unescaped if it was quoted) and whether it is UTF-8. All
   other parameters are ignored. */
static void
parse_content_type_param(const unsigned char *name, size_t name_len,
			 const unsigned char *value, size_t value_len,
			 bool value_quoted, void *context)
{
	struct message_decoder_context *ctx = context;
	char *charset;

	if (name_len != 7 || memcasecmp(name, "charset", 7) != 0)
		return;
	if (ctx->content_charset != NULL) {
		/* a charset was already given, keep the first one */
		return;
	}

	charset = i_strndup(value, value_len);
	if (value_quoted)
		str_unescape(charset);

	ctx->content_charset = charset;
	ctx->charset_utf8 = charset_is_utf8(charset);
}
/* Handles one header line. Continued lines only request the full value and
   return FALSE. For a complete line the Content-Type and
   Content-Transfer-Encoding headers update the decoder state, the value is
   decoded into ctx->buf and output->hdr is pointed to a copy of the header
   that uses the decoded value. Returns TRUE when output was filled. */
static bool message_decode_header(struct message_decoder_context *ctx,
				  struct message_header_line *hdr,
				  struct message_block *output)
{
	if (hdr->continues) {
		/* wait until the whole value has been read */
		hdr->use_full_value = TRUE;
		return FALSE;
	}

	switch (hdr->name_len) {
	case 12:
		if (strcasecmp(hdr->name, "Content-Type") == 0) {
			message_content_parse_header(hdr->full_value,
						     hdr->full_value_len,
						     NULL,
						     parse_content_type_param,
						     ctx);
		}
		break;
	case 25:
		if (strcasecmp(hdr->name, "Content-Transfer-Encoding") == 0) {
			message_content_parse_header(hdr->full_value,
						     hdr->full_value_len,
						     parse_content_encoding,
						     NULL, ctx);
		}
		break;
	}

	/* decode the value into buf */
	buffer_set_used_size(ctx->buf, 0);
	message_header_decode(hdr->full_value, hdr->full_value_len,
			      message_decode_header_callback, ctx);

	/* return a copy of the header whose full value is the decoded one */
	ctx->hdr = *hdr;
	ctx->hdr.full_value = ctx->buf->data;
	ctx->hdr.full_value_len = ctx->buf->used;
	ctx->hdr.value_len = 0;

	output->hdr = &ctx->hdr;
	return TRUE;
}
/* Translates the bytes that were left untranslated by the previous body
   block. They are joined with the beginning of the new data in a temporary
   buffer, translated into ctx->buf2, and *data / *size are advanced past the
   bytes of the new data that were consumed this way.

   Must be called only when ctx->translation_size != 0. */
static void translation_buf_decode(struct message_decoder_context *ctx,
				   const unsigned char **data, size_t *size)
{
	unsigned char trans_buf[MAX_TRANSLATION_BUF_SIZE+1];
	size_t pos, skip;

	/* @UNSAFE: move the previously untranslated bytes to trans_buf and
	   fill the rest of it from the beginning of the new data */
	memcpy(trans_buf, ctx->translation_buf, ctx->translation_size);
	skip = sizeof(trans_buf) - ctx->translation_size;
	if (skip > *size)
		skip = *size;
	memcpy(trans_buf + ctx->translation_size, *data, skip);

	/* pos is in/out: number of bytes available in trans_buf going in,
	   number of bytes actually translated coming out */
	pos = ctx->translation_size + skip;
	(void)charset_to_ucase_utf8_full(ctx->charset_trans,
					 trans_buf, &pos, ctx->buf2);

	/* at least the old bytes plus something new must have been used */
	i_assert(pos > ctx->translation_size);
	/* number of bytes consumed from the new data */
	skip = pos - ctx->translation_size;

	i_assert(*size >= skip);
	*data += skip;
	*size -= skip;

	ctx->translation_size = 0;
}
static bool message_decode_body(struct message_decoder_context *ctx,
struct message_block *input,
struct message_block *output)
{
unsigned char new_buf[MAX_ENCODING_BUF_SIZE+1];
const unsigned char *data = NULL;
size_t pos, size = 0, skip = 0;
bool unknown_charset;
if (ctx->charset_trans == NULL && !ctx->charset_utf8) {
ctx->charset_trans =
charset_to_utf8_begin(ctx->content_charset != NULL ?
ctx->content_charset : "UTF-8",
&unknown_charset);
}
if (ctx->encoding_size != 0) {
/* @UNSAFE */
memcpy(new_buf, ctx->encoding_buf, ctx->encoding_size);
skip = sizeof(new_buf) - ctx->encoding_size;
if (skip > input->size)
skip = input->size;
memcpy(new_buf + ctx->encoding_size, input->data, skip);
}
switch (ctx->content_type) {
case CONTENT_TYPE_UNKNOWN:
/* just skip this body */
return FALSE;
case CONTENT_TYPE_BINARY:
data = input->data;
size = pos = input->size;
break;
case CONTENT_TYPE_QP:
buffer_set_used_size(ctx->buf, 0);
if (ctx->encoding_size != 0) {
quoted_printable_decode(new_buf,
ctx->encoding_size + skip,
&pos, ctx->buf);
i_assert(pos > ctx->encoding_size);
skip = (ctx->encoding_size + skip) - pos;
}
quoted_printable_decode(input->data, input->size,
&pos, ctx->buf);
pos += skip;
data = ctx->buf->data;
size = ctx->buf->used;
break;
case CONTENT_TYPE_BASE64:
buffer_set_used_size(ctx->buf, 0);
if (ctx->encoding_size != 0) {
if (base64_decode(new_buf, ctx->encoding_size + skip,
&pos, ctx->buf) < 0) {
/* corrupted base64 data, don't bother with
the rest of it */
return FALSE;
}
i_assert(pos > ctx->encoding_size);
skip = (ctx->encoding_size + skip) - pos;
}
if (base64_decode(input->data + skip, input->size - skip,
&pos, ctx->buf) < 0) {
/* corrupted base64 data, don't bother with
the rest of it */
return FALSE;
}
pos += skip;
data = ctx->buf->data;
size = ctx->buf->used;
break;
}
if (pos != input->size) {
/* @UNSAFE */
ctx->encoding_size = input->size - pos;
i_assert(ctx->encoding_size <= sizeof(ctx->encoding_buf));
memcpy(ctx->encoding_buf, input->data + pos,
ctx->encoding_size);
}
if (ctx->charset_utf8 || ctx->charset_trans == NULL) {
output->data = data;
output->size = size;
} else {
buffer_set_used_size(ctx->buf2, 0);
if (ctx->translation_size != 0)
translation_buf_decode(ctx, &data, &size);
pos = size;
(void)charset_to_ucase_utf8_full(ctx->charset_trans,
data, &pos, ctx->buf2);
if (pos != size) {
ctx->translation_size = size - pos;
i_assert(ctx->translation_size <=
sizeof(ctx->translation_buf));
memcpy(ctx->translation_buf, data + pos,
ctx->translation_size);
}
output->data = ctx->buf2->data;
output->size = ctx->buf2->used;
}
output->hdr = NULL;
return TRUE;
}
/* Decodes the next block of a message. Header blocks (input->hdr != NULL)
   are handled by message_decode_header(), everything else by
   message_decode_body().

   The caller must keep passing the same output block: output->part is used
   to notice when input moves to another MIME part, and it is updated here.

   Returns TRUE when output was filled, FALSE when the block produced no
   output. */
bool message_decoder_decode_next_block(struct message_decoder_context *ctx,
				       struct message_block *input,
				       struct message_block *output)
{
	if (input->part != output->part) {
		/* MIME part changed. Forget everything that belongs to the
		   previous part: its charset and translation, and any bytes
		   that were carried over between its body blocks. */
		i_free_and_null(ctx->content_charset);
		if (ctx->charset_trans != NULL) {
			charset_to_utf8_end(&ctx->charset_trans);
			ctx->charset_trans = NULL;
		}
		ctx->encoding_size = 0;
		ctx->translation_size = 0;

		ctx->content_type = CONTENT_TYPE_BINARY;
		ctx->charset_utf8 = TRUE;
	}

	output->part = input->part;

	if (input->hdr != NULL)
		return message_decode_header(ctx, input->hdr, output);
	else
		return message_decode_body(ctx, input, output);
}