message-parser.c revision 5f6368ee09ba3edfb9b582199c2730a7b5517a10
/* Copyright (C) 2002 Timo Sirainen */
#include "lib.h"
#include "iobuffer.h"
#include "rfc822-tokenize.h"
#include "message-content-parser.h"
#include "message-parser.h"
#include "message-size.h"
/* One multipart boundary that is currently being searched for. Entries are
   kept in a singly linked list; message_parse_multipart() pushes a new entry
   to the head of the list and removes it again when the multipart is done. */
typedef struct _MessageBoundary {
/* next entry in the list, NULL at the end */
struct _MessageBoundary *next;
/* the multipart part whose Content-Type header declared this boundary */
MessagePart *part;
/* boundary text, without the leading "--" that precedes it in the body */
const char *boundary;
/* strlen(boundary), cached */
unsigned int len;
} MessageBoundary;
/* State shared by the parsing functions during one message_parse() call. */
typedef struct {
/* pool from which message parts and the strings below are allocated */
Pool pool;
/* the part whose header/body is currently being parsed */
MessagePart *part;
/* value of the "boundary" parameter seen in the current part's
   Content-Type header, or NULL. Reset to NULL once it has been used. */
char *last_boundary;
/* content type seen in the current part's Content-Type header, or NULL.
   Reset to NULL once the header has been handled. */
char *last_content_type;
/* list of boundaries of the multiparts we are currently inside of,
   most recently added first */
MessageBoundary *boundaries;
/* optional caller-supplied header callback and its context pointer */
MessageHeaderFunc func;
void *context;
} MessageParseContext;
/* Forward declarations. message_parse_part() and message_parse_multipart()
   call each other recursively; message_parse_body() and
   message_skip_boundary() are defined further down in this file. */
static MessagePart *message_parse_part(IOBuffer *inbuf,
MessageParseContext *parse_ctx);
static MessagePart *message_parse_body(IOBuffer *inbuf,
MessageBoundary *boundaries,
MessageSize *body_size);
static MessagePart *message_skip_boundary(IOBuffer *inbuf,
MessageBoundary *boundaries,
MessageSize *boundary_size);
/* Grow `dest' by the total (header + body) size of `part'. */
static void message_size_add_part(MessageSize *dest, MessagePart *part)
{
	const MessageSize *hdr = &part->header_size;
	const MessageSize *body = &part->body_size;

	dest->physical_size += hdr->physical_size + body->physical_size;
	dest->virtual_size += hdr->virtual_size + body->virtual_size;
	dest->lines += hdr->lines + body->lines;
}
/* Allocate a new part from `pool' and link it in as the last child of
   `parent'. Returns the new part. */
static MessagePart *message_part_append(Pool pool, MessagePart *parent)
{
	MessagePart *child, **tail;

	child = p_new(pool, MessagePart, 1);
	child->parent = parent;

	/* walk the sibling chain up to its terminating NULL link */
	for (tail = &parent->children; *tail != NULL; tail = &(*tail)->next)
		;

	*tail = child;
	return child;
}
static void parse_content_type(const Rfc822Token *tokens, int count,
void *context)
{
MessageParseContext *parse_ctx = context;
const char *str;
if (tokens[0].token != 'A')
return;
if (parse_ctx->last_content_type != NULL)
return;
str = rfc822_tokens_get_value(tokens, count);
parse_ctx->last_content_type = p_strdup(parse_ctx->pool, str);
if (strcasecmp(str, "message/rfc822") == 0)
parse_ctx->part->flags |= MESSAGE_PART_FLAG_MESSAGE_RFC822;
else if (strncasecmp(str, "text/", 5) == 0)
parse_ctx->part->flags |= MESSAGE_PART_FLAG_TEXT;
else if (strncasecmp(str, "multipart/", 10) == 0) {
parse_ctx->part->flags |= MESSAGE_PART_FLAG_MULTIPART;
if (strcasecmp(str+10, "digest") == 0) {
parse_ctx->part->flags |=
MESSAGE_PART_FLAG_MULTIPART_DIGEST;
}
}
}
/* Content-Type parameter callback for message_content_parse_header():
   remembers the first "boundary" parameter of a multipart part. */
static void parse_content_type_param(const Rfc822Token *name,
				     const Rfc822Token *value,
				     int value_count, void *context)
{
	MessageParseContext *ctx = context;
	const char *boundary;

	/* boundaries only matter for multipart content types */
	if ((ctx->part->flags & MESSAGE_PART_FLAG_MULTIPART) == 0)
		return;
	if (name->len != 8 || strncasecmp(name->ptr, "boundary", 8) != 0)
		return;
	/* keep the first boundary if the parameter is repeated */
	if (ctx->last_boundary != NULL)
		return;

	boundary = rfc822_tokens_get_value(value, value_count);
	ctx->last_boundary = p_strdup(ctx->pool, boundary);
}
/* Header callback used internally by message_parse_part(): forwards the
   header to the user's callback (if any) and additionally parses
   Content-Type headers so the content type and boundary become known. */
static void parse_header_field(MessagePart *part,
			       const char *name, unsigned int name_len,
			       const char *value, unsigned int value_len,
			       void *context)
{
	MessageParseContext *ctx = context;
	const char *content_type;

	/* the user-defined header parser always goes first */
	if (ctx->func != NULL)
		ctx->func(part, name, name_len, value, value_len, ctx->context);

	if (name_len != 12 || strncasecmp(name, "Content-Type", 12) != 0)
		return;

	/* the value isn't NUL-terminated, so work on a temporary copy */
	content_type = t_strndup(value, value_len);
	(void)message_content_parse_header(content_type,
					   parse_content_type,
					   parse_content_type_param,
					   ctx);
}
/* Parse the body of a multipart part. parse_ctx->part is the multipart part
   whose header was just parsed and parse_ctx->last_boundary holds its
   boundary. Each section between boundaries becomes a new child part.

   Returns the part owning the boundary at which parsing stopped (some part
   other than this one), or NULL if no further boundary was found. */
static MessagePart *message_parse_multipart(IOBuffer *inbuf,
					    MessageParseContext *parse_ctx)
{
	MessagePart *multipart, *child, *stop_part;
	MessageBoundary *entry;

	multipart = parse_ctx->part;

	/* push our boundary to the head of the active boundary list */
	entry = t_new(MessageBoundary, 1);
	entry->part = multipart;
	entry->boundary = parse_ctx->last_boundary;
	entry->len = strlen(entry->boundary);
	entry->next = parse_ctx->boundaries;
	parse_ctx->boundaries = entry;

	/* the header information has now been consumed */
	parse_ctx->last_boundary = NULL;
	parse_ctx->last_content_type = NULL;

	/* everything before the first boundary is skipped, but still counted
	   into our body size */
	stop_part = message_skip_boundary(inbuf, parse_ctx->boundaries,
					  &multipart->body_size);

	/* as long as it's our own boundary that was hit, another child
	   follows */
	while (stop_part == multipart) {
		child = message_part_append(parse_ctx->pool, multipart);

		/* the child begins where our header plus the body read so
		   far ends */
		child->physical_pos = multipart->physical_pos +
			multipart->header_size.physical_size +
			multipart->body_size.physical_size;

		parse_ctx->part = child;
		stop_part = message_parse_part(inbuf, parse_ctx);

		/* the child is part of our body */
		message_size_add_part(&multipart->body_size, child);

		if (stop_part == multipart) {
			/* child ended at our boundary, skip over it */
			stop_part = message_skip_boundary(inbuf,
						parse_ctx->boundaries,
						&multipart->body_size);
		}
	}

	/* pop our boundary */
	i_assert(parse_ctx->boundaries == entry);
	parse_ctx->boundaries = entry->next;
	return stop_part;
}
/* Parse one part: the header of parse_ctx->part followed by its body.

   - If the header declared a boundary, the body is handed to
     message_parse_multipart().
   - If the part is message/rfc822 (explicitly, or implicitly as a child of
     multipart/digest without a Content-Type), the body is parsed as an
     embedded message stored as the part's single child.
   - Otherwise the body is read up to the next known boundary.

   Returns the part owning the boundary that ended this part, or NULL if no
   boundary was found. */
static MessagePart *message_parse_part(IOBuffer *inbuf,
				       MessageParseContext *parse_ctx)
{
	MessagePart *next_part, *part;

	message_parse_header(parse_ctx->part, inbuf,
			     &parse_ctx->part->header_size,
			     parse_header_field, parse_ctx);

	if (parse_ctx->last_boundary != NULL)
		return message_parse_multipart(inbuf, parse_ctx);

	if (parse_ctx->last_content_type == NULL) {
		if (parse_ctx->part->parent != NULL &&
		    (parse_ctx->part->parent->flags &
		     MESSAGE_PART_FLAG_MULTIPART_DIGEST)) {
			/* when there's no content-type specified and we're
			   below multipart/digest, then assume message/rfc822
			   content-type */
			parse_ctx->part->flags |=
				MESSAGE_PART_FLAG_MESSAGE_RFC822;
		} else {
			/* otherwise we default to text/plain */
			parse_ctx->part->flags |= MESSAGE_PART_FLAG_TEXT;
		}
	}

	/* header information is consumed, don't let it leak to next part */
	parse_ctx->last_boundary = NULL;
	parse_ctx->last_content_type = NULL;

	if (parse_ctx->part->flags & MESSAGE_PART_FLAG_MESSAGE_RFC822) {
		/* message/rfc822 part - the message body begins with
		   headers again, this works pretty much the same as
		   a single multipart/mixed item */
		part = message_part_append(parse_ctx->pool, parse_ctx->part);

		/* set child position the same way multipart children get
		   theirs; without this the embedded message would be left
		   at position 0 */
		part->physical_pos =
			parse_ctx->part->physical_pos +
			parse_ctx->part->body_size.physical_size +
			parse_ctx->part->header_size.physical_size;

		parse_ctx->part = part;
		next_part = message_parse_part(inbuf, parse_ctx);
		parse_ctx->part = part->parent;

		/* our body size is the size of header+body in message/rfc822 */
		message_size_add_part(&part->parent->body_size, part);
	} else {
		/* normal message, read until the next boundary */
		part = parse_ctx->part;
		next_part = message_parse_body(inbuf, parse_ctx->boundaries,
					       &part->body_size);
	}

	return next_part;
}
/* Parse the whole message in `inbuf' into a tree of message parts allocated
   from `pool'. If `func' is non-NULL it is called with `context' for every
   header found. Returns the root part. */
MessagePart *message_parse(Pool pool, IOBuffer *inbuf,
			   MessageHeaderFunc func, void *context)
{
	MessageParseContext ctx;
	MessagePart *root;

	root = p_new(pool, MessagePart, 1);

	memset(&ctx, 0, sizeof(ctx));
	ctx.pool = pool;
	ctx.part = root;
	ctx.func = func;
	ctx.context = context;

	/* temporary allocations made while parsing are dropped at t_pop() */
	t_push();
	(void)message_parse_part(inbuf, &ctx);
	t_pop();

	return root;
}
/* Skip input up to and including the next LF. If msg_size is non-NULL, the
   skipped bytes are added to its physical and virtual sizes, the line is
   counted, and one extra virtual byte is counted when the LF wasn't
   preceded by CR. */
static void message_skip_line(IOBuffer *inbuf, MessageSize *msg_size)
{
	unsigned char *data;
	unsigned int pos, avail, pending;

	/* pending = number of already scanned bytes still in the buffer */
	pending = 0;
	while (io_buffer_read_data(inbuf, &data, &avail, pending) >= 0) {
		pos = pending;
		while (pos < avail && data[pos] != '\n')
			pos++;

		if (pos < avail) {
			/* LF found, the line ends here */
			if (msg_size != NULL) {
				if (pos == 0 || data[pos-1] != '\r')
					msg_size->virtual_size++;
				msg_size->lines++;
			}
			pending = pos + 1;
			break;
		}

		if (pos == 0)
			continue;

		/* no LF yet. drop everything except the last character,
		   it may be the CR belonging to a following LF */
		io_buffer_skip(inbuf, pos - 1);
		pending = 1;

		if (msg_size != NULL) {
			msg_size->physical_size += pos - 1;
			msg_size->virtual_size += pos - 1;
		}
	}

	io_buffer_skip(inbuf, pending);

	if (msg_size != NULL) {
		msg_size->physical_size += pending;
		msg_size->virtual_size += pending;
	}
}
/* Read a message header from `inbuf' up to and including the empty line
   that ends it.

   For every complete header line (including its continuation lines) that
   has a non-empty name before the first ':' and doesn't begin with
   whitespace, `func' (if non-NULL) is called with `part', the name with
   trailing whitespace removed, the value with leading whitespace and the
   line's final CR removed, and `context'. The name and value point into the
   input buffer and are not NUL-terminated.

   If hdr_size is non-NULL it is reset and then filled with the header's
   size: physical_size is the number of bytes consumed, virtual_size is the
   same plus one for each LF that had no CR before it, and lines is the
   number of LFs seen.

   Lines too long to fit into the input buffer are skipped without being
   passed to `func'. */
void message_parse_header(MessagePart *part, IOBuffer *inbuf,
			  MessageSize *hdr_size,
			  MessageHeaderFunc func, void *context)
{
	MessageSize skipped;
	unsigned char *msg;
	unsigned int i, size, startpos, missing_cr_count;
	unsigned int line_start, colon_pos, end_pos, name_len, value_len;
	int ret;

	if (hdr_size != NULL)
		memset(hdr_size, 0, sizeof(MessageSize));

	missing_cr_count = startpos = line_start = 0;
	colon_pos = UINT_MAX;
	while ((ret = io_buffer_read_data(inbuf, &msg,
					  &size, startpos+1)) != -1) {
		if (ret == -2) {
			/* overflow, line is too long. just skip it. */
			i_assert(size > 2);

			if (hdr_size == NULL)
				message_skip_line(inbuf, NULL);
			else {
				/* measure the skipped line separately.
				   virtual_size is derived from physical_size
				   at the end of this function, so letting
				   message_skip_line() add to
				   hdr_size->virtual_size directly would count
				   the skipped bytes twice. */
				memset(&skipped, 0, sizeof(skipped));
				message_skip_line(inbuf, &skipped);

				hdr_size->physical_size +=
					skipped.physical_size;
				hdr_size->lines += skipped.lines;
				missing_cr_count += (unsigned int)
					(skipped.virtual_size -
					 skipped.physical_size);
			}

			startpos = line_start = 0;
			colon_pos = UINT_MAX;
			continue;
		}

		if (size == 0) {
			/* no, we never want empty buffer */
			continue;
		}

		/* don't parse the last character, so we can always have
		   one character read-ahead. we never care about the last
		   character anyway, it's either the first character in
		   message body, or if there's no body for any reason, it's
		   the \n ending the header. */
		size--;
		for (i = startpos; i < size; i++) {
			if (msg[i] == ':' && colon_pos == UINT_MAX) {
				/* only the first colon in a header counts */
				colon_pos = i;
				continue;
			}

			if (msg[i] != '\n')
				continue;

			if (hdr_size != NULL)
				hdr_size->lines++;

			if (i == 0 || msg[i-1] != '\r') {
				/* missing CR */
				missing_cr_count++;
			}

			if (i == 0 || (i == 1 && msg[i-1] == '\r')) {
				/* no headers at all */
				break;
			}

			if ((i > 0 && msg[i-1] == '\n') ||
			    (i > 1 && msg[i-2] == '\n' && msg[i-1] == '\r')) {
				/* \n\n or \n\r\n - end of headers */
				break;
			}

			/* make sure the header doesn't continue to next line */
			if (!IS_LWSP(msg[i+1])) {
				if (colon_pos != UINT_MAX &&
				    colon_pos != line_start && func != NULL &&
				    !IS_LWSP(msg[line_start])) {
					/* we have a valid header line */

					/* get length of name-field */
					end_pos = colon_pos-1;
					while (end_pos > line_start &&
					       IS_LWSP(msg[end_pos]))
						end_pos--;
					name_len = end_pos - line_start + 1;

					/* get length of value field */
					colon_pos++;
					while (colon_pos < i &&
					       IS_LWSP(msg[colon_pos]))
						colon_pos++;
					value_len = i - colon_pos;
					if (msg[i-1] == '\r') value_len--;

					/* and finally call the function */
					func(part, msg + line_start, name_len,
					     msg + colon_pos, value_len,
					     context);
				}

				colon_pos = UINT_MAX;
				line_start = i+1;
			}
		}

		if (i < size) {
			/* end of header */
			startpos = i+1;
			break;
		}

		if (i > 0) {
			/* leave the last line to buffer. positions are
			   rebased so they stay relative to the buffer start
			   after the skip. */
			if (colon_pos != UINT_MAX)
				colon_pos -= line_start;
			if (hdr_size != NULL)
				hdr_size->physical_size += line_start;
			io_buffer_skip(inbuf, line_start);

			startpos = i-line_start;
			line_start = 0;
		}
	}

	io_buffer_skip(inbuf, startpos);

	if (hdr_size != NULL) {
		hdr_size->physical_size += startpos;
		hdr_size->virtual_size +=
			hdr_size->physical_size + missing_cr_count;
		i_assert(hdr_size->virtual_size >= hdr_size->physical_size);
	}
}
/* Return the first entry in `boundaries' whose boundary text is a prefix of
   the `len' bytes at `msg', or NULL if none is. */
static MessageBoundary *boundary_find(MessageBoundary *boundaries,
				      const char *msg, unsigned int len)
{
	MessageBoundary *cur;

	for (cur = boundaries; cur != NULL; cur = cur->next) {
		if (cur->len > len) {
			/* not enough data for this one to fit */
			continue;
		}
		if (strncmp(cur->boundary, msg, cur->len) == 0)
			return cur;
	}
	return NULL;
}
/* Read until the next boundary is found. A line is treated as a boundary
   when it begins with "--" followed by the text of any entry in
   `boundaries'; only the beginning of the line is compared, so trailing
   text (such as the closing "--") doesn't prevent a match.

   If skip_over = FALSE, stop at the [\r]\n before the boundary, otherwise
   leave the input right after the known boundary so the ending "--" can be
   checked.

   Everything skipped is added to msg_size: physical_size gets the number of
   bytes, virtual_size the same plus one for every LF that had no CR before
   it, and lines the number of LFs passed.

   Returns the matching boundary, or NULL if io_buffer_read_data() returned
   a negative value before any boundary matched.

   NOTE(review): msg_size is dereferenced unconditionally here; the callers
   in this file always pass a non-NULL pointer.
   NOTE(review): any negative read result ends the scan, and
   message_parse_header() treats -2 as "line too long for the buffer", so a
   very long body line would presumably end the search early - confirm
   against io_buffer_read_data(). */
static MessageBoundary *
message_find_boundary(IOBuffer *inbuf, MessageBoundary *boundaries,
MessageSize *msg_size, int skip_over)
{
MessageBoundary *boundary;
unsigned char *msg;
unsigned int i, size, startpos, line_start, missing_cr_count;
boundary = NULL;
/* startpos = number of already scanned bytes kept in the buffer,
   line_start = buffer offset where the current line begins */
missing_cr_count = startpos = line_start = 0;
while (io_buffer_read_data(inbuf, &msg, &size, startpos) >= 0) {
for (i = startpos; i < size; i++) {
if (msg[i] != '\n')
continue;
/* a complete line ends at i */
if (i > line_start+2 && msg[line_start] == '-' &&
msg[line_start+1] == '-') {
/* possible boundary */
boundary = boundary_find(boundaries,
msg + line_start + 2,
i - line_start - 2);
if (boundary != NULL)
break;
}
if (i == 0 || msg[i-1] != '\r') {
/* missing CR */
missing_cr_count++;
}
msg_size->lines++;
line_start = i+1;
}
if (boundary != NULL) {
/* boundary found */
break;
}
/* no boundary in the complete lines of this buffer. consume what
   can't be part of a boundary anymore. */
if (i > 0) {
if (i - line_start > 128 &&
msg[line_start] == '-' && msg[line_start+1] == '-') {
/* long partial line, see if it's a boundary.
RFC-2046 says that the boundaries must be
70 chars without "--" or less. We allow
a bit larger.. */
boundary = boundary_find(boundaries,
msg + line_start + 2,
i - line_start - 2);
if (boundary != NULL)
break;
/* nope, we can skip over the line, just
leave the last char since it may be \r */
/* NOTE(review): unlike the else-branch below, line_start isn't
   rebased here although the buffer contents shift by i bytes, so
   it appears to point at a stale offset afterwards - confirm */
i--;
} else {
/* leave the last line to buffer, it may be
boundary */
i = line_start;
if (i > 2) i -= 2; /* leave the \r\n too */
/* rebase line_start to the buffer contents after the skip */
line_start -= i;
}
io_buffer_skip(inbuf, i);
msg_size->physical_size += i;
msg_size->virtual_size += i;
/* the bytes left in the buffer have already been scanned */
startpos = size - i;
}
}
if (boundary != NULL) {
if (skip_over) {
/* leave the pointer right after the boundary */
line_start += 2 + boundary->len;
} else if (line_start > 0 && msg[line_start-1] == '\n') {
/* leave the \r\n before the boundary */
/* the LF is left unread, so undo the line count (and the missing
   CR count, if it had no CR) that was already added for it */
line_start--;
msg_size->lines--;
if (line_start > 0 && msg[line_start-1] == '\r')
line_start--;
else
missing_cr_count--;
}
startpos = line_start;
}
/* consume what remains to be skipped and finish the size accounting */
io_buffer_skip(inbuf, startpos);
msg_size->physical_size += startpos;
msg_size->virtual_size += startpos + missing_cr_count;
i_assert(msg_size->virtual_size >= msg_size->physical_size);
return boundary;
}
/* Read a non-multipart body, adding its size to body_size. Without any
   active boundaries the body size is taken with message_get_body_size();
   otherwise the body ends at the next known boundary.

   Returns the part owning the boundary that was found, or NULL if there
   were no boundaries or none was found. */
static MessagePart *message_parse_body(IOBuffer *inbuf,
				       MessageBoundary *boundaries,
				       MessageSize *body_size)
{
	MessageBoundary *hit;

	if (boundaries == NULL) {
		message_get_body_size(inbuf, body_size, (uoff_t)-1);
		return NULL;
	}

	hit = message_find_boundary(inbuf, boundaries, body_size, FALSE);
	if (hit == NULL)
		return NULL;
	return hit->part;
}
/* Skip data up to the next boundary and over the boundary line itself,
   adding everything skipped to boundary_size. If the boundary is followed
   by "--" (an end boundary), the data after it is skipped as well, up to
   the next boundary.

   Returns the part owning the boundary where reading stopped, or NULL if
   no boundary was found. */
static MessagePart *message_skip_boundary(IOBuffer *inbuf,
					  MessageBoundary *boundaries,
					  MessageSize *boundary_size)
{
	MessageBoundary *hit;
	unsigned char *data;
	unsigned int avail;
	int is_end;

	hit = message_find_boundary(inbuf, boundaries, boundary_size, TRUE);
	if (hit == NULL)
		return NULL;

	/* the input is now right after the boundary text. two bytes are
	   needed to tell if "--" follows. */
	is_end = FALSE;
	while (io_buffer_read_data(inbuf, &data, &avail, 1) >= 0) {
		if (avail < 2)
			continue;

		is_end = data[0] == '-' && data[1] == '-';
		break;
	}

	/* the rest of the boundary line is of no interest */
	message_skip_line(inbuf, boundary_size);

	if (is_end) {
		/* end boundary, skip what follows it */
		return message_parse_body(inbuf, boundaries, boundary_size);
	}
	return hit->part;
}