/* message-header-parser.c revision 6cb3c4f4276531258be706821e034f1f0a8cd276 */
/* Copyright (c) 2002-2012 Dovecot authors, see the included COPYING file */
c25356d5978632df6203437e1953bcb29e0c736fTimo Sirainen
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen#include "lib.h"
d23c747de9d33966483fbdd41f08ad7766da7c5cTimo Sirainen#include "buffer.h"
d23c747de9d33966483fbdd41f08ad7766da7c5cTimo Sirainen#include "istream.h"
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen#include "str.h"
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen#include "message-size.h"
3e0bae44b65f5c46989fcef3d1e07203f496327eTimo Sirainen#include "message-header-parser.h"
3e0bae44b65f5c46989fcef3d1e07203f496327eTimo Sirainen
3e0bae44b65f5c46989fcef3d1e07203f496327eTimo Sirainenstruct message_header_parser_ctx {
3e0bae44b65f5c46989fcef3d1e07203f496327eTimo Sirainen struct message_header_line line;
3e0bae44b65f5c46989fcef3d1e07203f496327eTimo Sirainen
3e0bae44b65f5c46989fcef3d1e07203f496327eTimo Sirainen struct istream *input;
3e0bae44b65f5c46989fcef3d1e07203f496327eTimo Sirainen struct message_size *hdr_size;
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen string_t *name;
296dca49e4fe6046e0328c67ef1cf4b8077dec9cTimo Sirainen buffer_t *value_buf;
7fd51f7b0b4d990ec3cfef4e60ee685bf9fb32deTimo Sirainen size_t skip;
7fd51f7b0b4d990ec3cfef4e60ee685bf9fb32deTimo Sirainen
7fd51f7b0b4d990ec3cfef4e60ee685bf9fb32deTimo Sirainen enum message_header_parser_flags flags;
eb64c3586d854cddd693f0b811d897399076a441Timo Sirainen unsigned int skip_line:1;
eb64c3586d854cddd693f0b811d897399076a441Timo Sirainen unsigned int has_nuls:1;
eb64c3586d854cddd693f0b811d897399076a441Timo Sirainen};
eb64c3586d854cddd693f0b811d897399076a441Timo Sirainen
296dca49e4fe6046e0328c67ef1cf4b8077dec9cTimo Sirainenstruct message_header_parser_ctx *
eb64c3586d854cddd693f0b811d897399076a441Timo Sirainenmessage_parse_header_init(struct istream *input, struct message_size *hdr_size,
4ac2d38239cea8090154e17faefd77de5a71d882Timo Sirainen enum message_header_parser_flags flags)
eb64c3586d854cddd693f0b811d897399076a441Timo Sirainen{
4ac2d38239cea8090154e17faefd77de5a71d882Timo Sirainen struct message_header_parser_ctx *ctx;
eb64c3586d854cddd693f0b811d897399076a441Timo Sirainen
32e1554df9abca74fef0af2ba2e4c37e90a06cd0Timo Sirainen ctx = i_new(struct message_header_parser_ctx, 1);
eb64c3586d854cddd693f0b811d897399076a441Timo Sirainen ctx->input = input;
32e1554df9abca74fef0af2ba2e4c37e90a06cd0Timo Sirainen ctx->hdr_size = hdr_size;
a988c3fd9251806e38931a011aaa4006dd081cbdTimo Sirainen ctx->name = str_new(default_pool, 128);
16db188cfddce117500a161302f17ae691b4500eTimo Sirainen ctx->flags = flags;
b337d3b6871b878d6467d7d8ed600433af5da5a1Timo Sirainen ctx->value_buf = buffer_create_dynamic(default_pool, 4096);
b337d3b6871b878d6467d7d8ed600433af5da5a1Timo Sirainen
16db188cfddce117500a161302f17ae691b4500eTimo Sirainen if (hdr_size != NULL)
60b42c6dfdf9edcca8a96b380ef9a0adc60c2464Timo Sirainen memset(hdr_size, 0, sizeof(*hdr_size));
16db188cfddce117500a161302f17ae691b4500eTimo Sirainen return ctx;
16db188cfddce117500a161302f17ae691b4500eTimo Sirainen}
16db188cfddce117500a161302f17ae691b4500eTimo Sirainen
16db188cfddce117500a161302f17ae691b4500eTimo Sirainenvoid message_parse_header_deinit(struct message_header_parser_ctx **_ctx)
16db188cfddce117500a161302f17ae691b4500eTimo Sirainen{
296dca49e4fe6046e0328c67ef1cf4b8077dec9cTimo Sirainen struct message_header_parser_ctx *ctx = *_ctx;
296dca49e4fe6046e0328c67ef1cf4b8077dec9cTimo Sirainen
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen i_stream_skip(ctx->input, ctx->skip);
2b95b7a9f4f06e7640ef431d9e6efc2423cacf1aTimo Sirainen buffer_free(&ctx->value_buf);
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen str_free(&ctx->name);
6145bd3b17b9135b77b0b42409a0cc3fa0d1b946Timo Sirainen i_free(ctx);
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen
3e0bae44b65f5c46989fcef3d1e07203f496327eTimo Sirainen *_ctx = NULL;
296dca49e4fe6046e0328c67ef1cf4b8077dec9cTimo Sirainen}
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen
67c47dbb3fde79218320fd38a45c33f61bbf3012Timo Sirainenint message_parse_header_next(struct message_header_parser_ctx *ctx,
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen struct message_header_line **hdr_r)
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen{
af3f857bb3166ed99595e11a9d18e5b5cc670e1aTimo Sirainen struct message_header_line *line = &ctx->line;
af3f857bb3166ed99595e11a9d18e5b5cc670e1aTimo Sirainen const unsigned char *msg;
af3f857bb3166ed99595e11a9d18e5b5cc670e1aTimo Sirainen size_t i, size, startpos, colon_pos, parse_size;
af3f857bb3166ed99595e11a9d18e5b5cc670e1aTimo Sirainen int ret;
af3f857bb3166ed99595e11a9d18e5b5cc670e1aTimo Sirainen bool continued, continues, last_no_newline, last_crlf;
af3f857bb3166ed99595e11a9d18e5b5cc670e1aTimo Sirainen bool no_newline, crlf_newline;
af3f857bb3166ed99595e11a9d18e5b5cc670e1aTimo Sirainen
af3f857bb3166ed99595e11a9d18e5b5cc670e1aTimo Sirainen *hdr_r = NULL;
af3f857bb3166ed99595e11a9d18e5b5cc670e1aTimo Sirainen if (line->eoh)
af3f857bb3166ed99595e11a9d18e5b5cc670e1aTimo Sirainen return -1;
67c47dbb3fde79218320fd38a45c33f61bbf3012Timo Sirainen
c58906589cafc32df4c04ffbef933baadd3f2276Timo Sirainen if (ctx->skip > 0) {
47ede56f4e6eebfe631a1f0febf74d7adcdbcd00Timo Sirainen i_stream_skip(ctx->input, ctx->skip);
47ede56f4e6eebfe631a1f0febf74d7adcdbcd00Timo Sirainen ctx->skip = 0;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen }
c396c5cdd510d09aa35875ccfd643c5c21ed1f89Timo Sirainen
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen if (line->continues)
6145bd3b17b9135b77b0b42409a0cc3fa0d1b946Timo Sirainen colon_pos = 0;
0dffa25d211be541ee3c953b23566a1a990789dfTimo Sirainen else {
0dffa25d211be541ee3c953b23566a1a990789dfTimo Sirainen /* new header line */
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen line->name_offset = ctx->input->v_offset;
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen colon_pos = UINT_MAX;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen buffer_set_used_size(ctx->value_buf, 0);
d9250ee7e2815bb2116134b58f7c860578148d6cTimo Sirainen }
a10ed8c47534b4c6b6bf2711ccfe577e720a47b4Timo Sirainen
092018b35bb1dc5bd61848a38189fe6ac8f791ddTimo Sirainen no_newline = FALSE;
7327394e30c1020b9a2a49c72a7e3d0f7803e680Timo Sirainen crlf_newline = FALSE;
7327394e30c1020b9a2a49c72a7e3d0f7803e680Timo Sirainen continued = line->continues;
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen continues = FALSE;
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen
90804278df6586cceaf1b1b07a44713c01694048Timo Sirainen for (startpos = 0;;) {
90804278df6586cceaf1b1b07a44713c01694048Timo Sirainen ret = i_stream_read_data(ctx->input, &msg, &size, startpos+1);
90804278df6586cceaf1b1b07a44713c01694048Timo Sirainen if (ret >= 0) {
90804278df6586cceaf1b1b07a44713c01694048Timo Sirainen /* we want to know one byte in advance to find out
90804278df6586cceaf1b1b07a44713c01694048Timo Sirainen if it's multiline header */
90804278df6586cceaf1b1b07a44713c01694048Timo Sirainen parse_size = size == 0 ? 0 : size-1;
90804278df6586cceaf1b1b07a44713c01694048Timo Sirainen } else {
90804278df6586cceaf1b1b07a44713c01694048Timo Sirainen parse_size = size;
6145bd3b17b9135b77b0b42409a0cc3fa0d1b946Timo Sirainen }
6145bd3b17b9135b77b0b42409a0cc3fa0d1b946Timo Sirainen
6145bd3b17b9135b77b0b42409a0cc3fa0d1b946Timo Sirainen if (ret <= 0 && startpos == parse_size) {
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen if (ret == -1) {
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen if (startpos > 0) {
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen /* header ended unexpectedly. */
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen no_newline = TRUE;
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen ctx->skip = startpos;
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen break;
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen }
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen /* error / EOF with no bytes */
c8cf8a605e0ddea7cb36fe04551aeca5090e684bTimo Sirainen return -1;
c8cf8a605e0ddea7cb36fe04551aeca5090e684bTimo Sirainen }
c8cf8a605e0ddea7cb36fe04551aeca5090e684bTimo Sirainen
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen if (size > 0 && !ctx->skip_line &&
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen (msg[0] == '\n' ||
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen (msg[0] == '\r' && size > 1 && msg[1] == '\n'))) {
ef50336eefcb9ba99f73c6af37420eaf8857a39bTimo Sirainen /* end of headers - this mostly happens just
c0a87e5f3316a57e6f915882fa1951d0fbb74a61Timo Sirainen with mbox where headers are read separately
c0a87e5f3316a57e6f915882fa1951d0fbb74a61Timo Sirainen from body */
14376e0584c178306c400750352869cf2aaf6feeTimo Sirainen size = 0;
c0a87e5f3316a57e6f915882fa1951d0fbb74a61Timo Sirainen if (ctx->hdr_size != NULL)
68a4946b12583b88fa802e52ebee45cd96056772Timo Sirainen ctx->hdr_size->lines++;
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen if (msg[0] == '\r') {
9b706b345064ce8e8a657f54633f009a101298eaTimo Sirainen ctx->skip = 2;
9b706b345064ce8e8a657f54633f009a101298eaTimo Sirainen crlf_newline = TRUE;
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen } else {
c0a87e5f3316a57e6f915882fa1951d0fbb74a61Timo Sirainen ctx->skip = 1;
9b706b345064ce8e8a657f54633f009a101298eaTimo Sirainen if (ctx->hdr_size != NULL)
9b706b345064ce8e8a657f54633f009a101298eaTimo Sirainen ctx->hdr_size->virtual_size++;
288d6ef592719f2be3cad9f034e9be05f9839785Timo Sirainen }
288d6ef592719f2be3cad9f034e9be05f9839785Timo Sirainen break;
288d6ef592719f2be3cad9f034e9be05f9839785Timo Sirainen }
288d6ef592719f2be3cad9f034e9be05f9839785Timo Sirainen if (ret == 0 && !ctx->input->eof) {
9b706b345064ce8e8a657f54633f009a101298eaTimo Sirainen /* stream is nonblocking - need more data */
9b706b345064ce8e8a657f54633f009a101298eaTimo Sirainen return 0;
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen }
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen i_assert(size > 0);
c0a87e5f3316a57e6f915882fa1951d0fbb74a61Timo Sirainen
9b706b345064ce8e8a657f54633f009a101298eaTimo Sirainen /* a) line is larger than input buffer
9b706b345064ce8e8a657f54633f009a101298eaTimo Sirainen b) header ended unexpectedly */
32e1554df9abca74fef0af2ba2e4c37e90a06cd0Timo Sirainen if (colon_pos == UINT_MAX && ret == -2 && !continued) {
32e1554df9abca74fef0af2ba2e4c37e90a06cd0Timo Sirainen /* header name is huge. just skip it. */
c0a87e5f3316a57e6f915882fa1951d0fbb74a61Timo Sirainen i_assert(size > 1);
9b706b345064ce8e8a657f54633f009a101298eaTimo Sirainen if (msg[size-1] == '\r')
9b706b345064ce8e8a657f54633f009a101298eaTimo Sirainen size--;
a988c3fd9251806e38931a011aaa4006dd081cbdTimo Sirainen
a988c3fd9251806e38931a011aaa4006dd081cbdTimo Sirainen if (ctx->hdr_size != NULL) {
c0a87e5f3316a57e6f915882fa1951d0fbb74a61Timo Sirainen ctx->hdr_size->physical_size += size;
9b706b345064ce8e8a657f54633f009a101298eaTimo Sirainen ctx->hdr_size->virtual_size += size;
957d09e495c33ad1180f82152e5e87e6b51ab04bTimo Sirainen }
957d09e495c33ad1180f82152e5e87e6b51ab04bTimo Sirainen i_stream_skip(ctx->input, size);
9874ad56b94788297fdac4eae7cba5d651b48222Timo Sirainen ctx->skip_line = TRUE;
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen startpos = 0;
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen continue;
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen }
957d09e495c33ad1180f82152e5e87e6b51ab04bTimo Sirainen
957d09e495c33ad1180f82152e5e87e6b51ab04bTimo Sirainen if (ret == -2) {
957d09e495c33ad1180f82152e5e87e6b51ab04bTimo Sirainen /* go back to last LWSP if found. */
957d09e495c33ad1180f82152e5e87e6b51ab04bTimo Sirainen size_t min_pos = !continued ? colon_pos : 0;
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen for (i = size-1; i > min_pos; i--) {
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen if (IS_LWSP(msg[i])) {
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen size = i;
2615df45a8027948a474abe5e817b34b0499c171Timo Sirainen break;
a3f6d0302a83270253ff9d2ebd4fea0003e9ac95Timo Sirainen }
14b551180cb4ac7acac8b048d8d6d7278541d1f6Timo Sirainen }
14b551180cb4ac7acac8b048d8d6d7278541d1f6Timo Sirainen if (i == min_pos && (msg[size-1] == '\r' ||
14b551180cb4ac7acac8b048d8d6d7278541d1f6Timo Sirainen msg[size-1] == '\n')) {
14b551180cb4ac7acac8b048d8d6d7278541d1f6Timo Sirainen /* we may or may not have a full header,
14b551180cb4ac7acac8b048d8d6d7278541d1f6Timo Sirainen but we don't know until we get the
e8490a52a1bc71bc53034e68f464435684ad810fTimo Sirainen next character. leave out the
linefeed and finish the header on
the next run. */
size--;
if (size > 0 && msg[size-1] == '\r')
size--;
}
continues = TRUE;
}
no_newline = TRUE;
ctx->skip = size;
break;
}
/* find ':' */
if (colon_pos == UINT_MAX) {
for (i = startpos; i < parse_size; i++) {
if (msg[i] > ':')
continue;
if (msg[i] == ':' && !ctx->skip_line) {
colon_pos = i;
line->full_value_offset =
ctx->input->v_offset + i + 1;
break;
}
if (msg[i] == '\n') {
/* end of headers, or error */
break;
}
if (msg[i] == '\0')
ctx->has_nuls = TRUE;
}
} else {
i = startpos;
}
/* find '\n' */
for (; i < parse_size; i++) {
if (msg[i] <= '\n') {
if (msg[i] == '\n')
break;
if (msg[i] == '\0')
ctx->has_nuls = TRUE;
}
}
if (i < parse_size && i+1 == size && ret == -2) {
/* we don't know if the line continues. */
i++;
} else if (i < parse_size) {
/* got a line */
if (ctx->skip_line) {
/* skipping a line with a huge header name */
if (ctx->hdr_size != NULL) {
ctx->hdr_size->lines++;
ctx->hdr_size->physical_size += i + 1;
ctx->hdr_size->virtual_size += i + 1;
}
if (i == 0 || msg[i-1] != '\r') {
/* missing CR */
if (ctx->hdr_size != NULL)
ctx->hdr_size->virtual_size++;
}
i_stream_skip(ctx->input, i + 1);
startpos = 0;
ctx->skip_line = FALSE;
continue;
}
continues = i+1 < size && IS_LWSP(msg[i+1]);
if (ctx->hdr_size != NULL)
ctx->hdr_size->lines++;
if (i == 0 || msg[i-1] != '\r') {
/* missing CR */
if (ctx->hdr_size != NULL)
ctx->hdr_size->virtual_size++;
size = i;
} else {
size = i-1;
crlf_newline = TRUE;
}
ctx->skip = i+1;
break;
}
startpos = i;
}
last_crlf = line->crlf_newline &&
(ctx->flags & MESSAGE_HEADER_PARSER_FLAG_DROP_CR) == 0;
last_no_newline = line->no_newline ||
(ctx->flags & MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE) != 0;
line->continues = continues;
line->continued = continued;
line->crlf_newline = crlf_newline;
line->no_newline = no_newline;
if (size == 0) {
/* end of headers */
line->eoh = TRUE;
line->name_len = line->value_len = line->full_value_len = 0;
line->name = ""; line->value = line->full_value = NULL;
line->middle = NULL; line->middle_len = 0;
line->full_value_offset = line->name_offset;
line->continues = FALSE;
} else if (line->continued) {
line->value = msg;
line->value_len = size;
} else if (colon_pos == UINT_MAX) {
/* missing ':', assume the whole line is name */
line->value = NULL;
line->value_len = 0;
str_truncate(ctx->name, 0);
buffer_append(ctx->name, msg, size);
line->name = str_c(ctx->name);
line->name_len = str_len(ctx->name);
line->middle = NULL;
line->middle_len = 0;
} else {
size_t pos;
line->value = msg + colon_pos+1;
line->value_len = size - colon_pos - 1;
if (ctx->flags & MESSAGE_HEADER_PARSER_FLAG_SKIP_INITIAL_LWSP) {
/* get value. skip all LWSP after ':'. Note that
RFC2822 doesn't say we should, but history behind
it..
Exception to this is if the value consists only of
LWSP, then skip only the one LWSP after ':'. */
for (pos = 0; pos < line->value_len; pos++) {
if (!IS_LWSP(line->value[pos]))
break;
}
if (pos == line->value_len) {
/* everything was LWSP */
if (line->value_len > 0 &&
IS_LWSP(line->value[0]))
pos = 1;
}
} else {
pos = line->value_len > 0 &&
IS_LWSP(line->value[0]) ? 1 : 0;
}
line->value += pos;
line->value_len -= pos;
line->full_value_offset += pos;
/* get name, skip LWSP before ':' */
while (colon_pos > 0 && IS_LWSP(msg[colon_pos-1]))
colon_pos--;
str_truncate(ctx->name, 0);
/* use buffer_append() so the name won't be truncated if there
are NULs. */
buffer_append(ctx->name, msg, colon_pos);
str_append_c(ctx->name, '\0');
/* keep middle stored also in ctx->name so it's available
with use_full_value */
line->middle = msg + colon_pos;
line->middle_len = (size_t)(line->value - line->middle);
str_append_n(ctx->name, line->middle, line->middle_len);
line->name = str_c(ctx->name);
line->name_len = colon_pos;
line->middle = str_data(ctx->name) + line->name_len + 1;
}
if (!line->continued) {
/* first header line. make a copy of the line since we can't
really trust input stream not to lose it. */
buffer_append(ctx->value_buf, line->value, line->value_len);
line->value = line->full_value = ctx->value_buf->data;
line->full_value_len = line->value_len;
} else if (line->use_full_value) {
/* continue saving the full value. */
if (last_no_newline) {
/* line is longer than fit into our buffer, so we
were forced to break it into multiple
message_header_lines */
} else {
if (last_crlf)
buffer_append_c(ctx->value_buf, '\r');
buffer_append_c(ctx->value_buf, '\n');
}
if ((ctx->flags & MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE) &&
line->value_len > 0 && line->value[0] != ' ' &&
IS_LWSP(line->value[0])) {
buffer_append_c(ctx->value_buf, ' ');
buffer_append(ctx->value_buf,
line->value + 1, line->value_len - 1);
} else {
buffer_append(ctx->value_buf,
line->value, line->value_len);
}
line->full_value = buffer_get_data(ctx->value_buf,
&line->full_value_len);
} else {
/* we didn't want full_value, and this is a continued line. */
line->full_value = NULL;
line->full_value_len = 0;
}
/* always reset it */
line->use_full_value = FALSE;
if (ctx->hdr_size != NULL) {
ctx->hdr_size->physical_size += ctx->skip;
ctx->hdr_size->virtual_size += ctx->skip;
}
*hdr_r = line;
return 1;
}
/* Tell whether this parser has seen a NUL byte while scanning headers
   so far. */
bool message_parse_header_has_nuls(const struct message_header_parser_ctx *ctx)
{
	bool seen_nul = ctx->has_nuls;

	return seen_nul;
}
/* NOTE(review): presumably the header defines a same-named wrapper macro,
   which has to be removed before the real function is defined - confirm. */
#undef message_parse_header
/*
 * Parse all headers from input, invoking callback(line, context) once for
 * every line returned by message_parse_header_next(), and finally
 * callback(NULL, context) after the parser has been deinitialized.
 *
 * A "need more data" (0) result from the parser trips the assert below.
 */
void message_parse_header(struct istream *input, struct message_size *hdr_size,
			  enum message_header_parser_flags flags,
			  message_header_callback_t *callback, void *context)
{
	struct message_header_parser_ctx *parser;
	struct message_header_line *cur_line;
	int ret;

	parser = message_parse_header_init(input, hdr_size, flags);
	for (;;) {
		ret = message_parse_header_next(parser, &cur_line);
		if (ret <= 0)
			break;
		callback(cur_line, context);
	}
	i_assert(ret != 0);
	message_parse_header_deinit(&parser);

	/* call after the final skipping */
	callback(NULL, context);
}