message-snippet.c revision 7f7be2cbf68f8a202a688d5bc50f82483d461643
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
659fe5d24825b160cae512538088020d97a60239Timo Sirainen#include "lib.h"
659fe5d24825b160cae512538088020d97a60239Timo Sirainen#include "buffer.h"
659fe5d24825b160cae512538088020d97a60239Timo Sirainen#include "str.h"
659fe5d24825b160cae512538088020d97a60239Timo Sirainen#include "istream.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "mail-html2text.h"
16f816d3f3c32ae3351834253f52ddd0212bcbf3Timo Sirainen#include "message-parser.h"
503a863a317acba125a4e46435694e35fad769e4Timo Sirainen#include "message-decoder.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen#include "message-snippet.h"
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
/* Scanner position used while turning body text into a snippet. */
enum snippet_state {
	/* at the first byte of a line; this is also the zero-initialized
	   starting state */
	SNIPPET_STATE_NEWLINE = 0,
	/* somewhere inside a line of ordinary text */
	SNIPPET_STATE_NORMAL = 1,
	/* inside a quoted line - everything is dropped until the next LF */
	SNIPPET_STATE_QUOTED = 2
};
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen
2674b4f0cf8f3c203d8e56b29735f5e267038dafTimo Sirainenstruct snippet_context {
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen string_t *snippet;
8d80659e504ffb34bb0c6a633184fece35751b18Timo Sirainen unsigned int chars_left;
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen enum snippet_state state;
1175f27441385a7011629f295f42708f9a3a4ffcTimo Sirainen bool add_whitespace;
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen struct mail_html2text *html2text;
8d80659e504ffb34bb0c6a633184fece35751b18Timo Sirainen buffer_t *plain_output;
5a07b37a9df398b5189c14872a600384208ab74bTimo Sirainen};
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainenstatic bool snippet_generate(struct snippet_context *ctx,
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen const unsigned char *data, size_t size)
5a07b37a9df398b5189c14872a600384208ab74bTimo Sirainen{
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen unsigned int i, count;
1175f27441385a7011629f295f42708f9a3a4ffcTimo Sirainen
1175f27441385a7011629f295f42708f9a3a4ffcTimo Sirainen if (ctx->html2text != NULL) {
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen buffer_set_used_size(ctx->plain_output, 0);
bb10ebcf076c959c752f583746d83805d7686df8Timo Sirainen mail_html2text_more(ctx->html2text, data, size,
bb10ebcf076c959c752f583746d83805d7686df8Timo Sirainen ctx->plain_output);
8d80659e504ffb34bb0c6a633184fece35751b18Timo Sirainen data = ctx->plain_output->data;
16c89b1260c9d07c01c83a9219424d3727069b2eTimo Sirainen size = ctx->plain_output->used;
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen }
bb10ebcf076c959c752f583746d83805d7686df8Timo Sirainen
bb10ebcf076c959c752f583746d83805d7686df8Timo Sirainen /* message-decoder should feed us only valid and complete
bb10ebcf076c959c752f583746d83805d7686df8Timo Sirainen UTF-8 input */
bb10ebcf076c959c752f583746d83805d7686df8Timo Sirainen for (i = 0; i < size; i += count) {
faed8babca9914257f34fb2e603d74016d563b2dTimo Sirainen count = 1;
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen switch (ctx->state) {
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen case SNIPPET_STATE_NEWLINE:
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen if (data[i] == '>' && ctx->html2text == NULL) {
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen ctx->state = SNIPPET_STATE_QUOTED;
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen break;
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen }
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen ctx->state = SNIPPET_STATE_NORMAL;
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen /* fallthrough */
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen case SNIPPET_STATE_NORMAL:
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen if (data[i] == '\r' || data[i] == '\n' ||
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen data[i] == '\t' || data[i] == ' ') {
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen ctx->add_whitespace = TRUE;
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen if (data[i] == '\n')
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen ctx->state = SNIPPET_STATE_NEWLINE;
287ba82a8da3eaa473b5735d4eeac2fb4c5d8117Timo Sirainen break;
e8acc691a14a6d0884c5ca9aa4d8507f1e082040Timo Sirainen }
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen if (ctx->add_whitespace) {
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen str_append_c(ctx->snippet, ' ');
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen ctx->add_whitespace = FALSE;
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen if (ctx->chars_left-- == 0)
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen return FALSE;
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen }
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen if (ctx->chars_left-- == 0)
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen return FALSE;
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen count = uni_utf8_char_bytes(data[i]);
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen i_assert(i + count <= size);
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen str_append_n(ctx->snippet, data + i, count);
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen break;
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen case SNIPPET_STATE_QUOTED:
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen if (data[i] == '\n')
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen ctx->state = SNIPPET_STATE_NEWLINE;
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen break;
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen }
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen }
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen return TRUE;
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainen}
2674b4f0cf8f3c203d8e56b29735f5e267038dafTimo Sirainen
eac3948d67eff8623d51aeaea9eca582f3aec677Timo Sirainenint message_snippet_generate(struct istream *input,
46c31f64b9f0949f00b7819f45b22f2d64b2ea27Timo Sirainen unsigned int max_snippet_chars,
e8acc691a14a6d0884c5ca9aa4d8507f1e082040Timo Sirainen string_t *snippet)
d5cebe7f98e63d4e2822863ef2faa4971e8b3a5dTimo Sirainen{
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen struct message_parser_ctx *parser;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen struct message_part *parts;
0cb2e8eb55e70f8ebe1e8349bdf49e4cbe5d8834Timo Sirainen struct message_decoder_context *decoder;
48136ae5a0eb49daa44e343553f3688a500307e2Timo Sirainen struct message_block raw_block, block;
2674b4f0cf8f3c203d8e56b29735f5e267038dafTimo Sirainen struct snippet_context ctx;
48136ae5a0eb49daa44e343553f3688a500307e2Timo Sirainen pool_t pool;
48136ae5a0eb49daa44e343553f3688a500307e2Timo Sirainen int ret;
48136ae5a0eb49daa44e343553f3688a500307e2Timo Sirainen
48136ae5a0eb49daa44e343553f3688a500307e2Timo Sirainen memset(&ctx, 0, sizeof(ctx));
e376693bfa3985232c41df99c7010fca22612c89Timo Sirainen pool = pool_alloconly_create("message snippet", 1024);
e376693bfa3985232c41df99c7010fca22612c89Timo Sirainen ctx.snippet = snippet;
e376693bfa3985232c41df99c7010fca22612c89Timo Sirainen ctx.chars_left = max_snippet_chars;
e376693bfa3985232c41df99c7010fca22612c89Timo Sirainen
e376693bfa3985232c41df99c7010fca22612c89Timo Sirainen parser = message_parser_init(pool_datastack_create(), input, 0, 0);
e376693bfa3985232c41df99c7010fca22612c89Timo Sirainen decoder = message_decoder_init(NULL, 0);
e376693bfa3985232c41df99c7010fca22612c89Timo Sirainen while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
09c3a491f4f6ccebe290c7709bdc0d79a187610bTimo Sirainen if (!message_decoder_decode_next_block(decoder, &raw_block, &block))
09c3a491f4f6ccebe290c7709bdc0d79a187610bTimo Sirainen continue;
09c3a491f4f6ccebe290c7709bdc0d79a187610bTimo Sirainen if (block.size == 0) {
09c3a491f4f6ccebe290c7709bdc0d79a187610bTimo Sirainen const char *ct;
09c3a491f4f6ccebe290c7709bdc0d79a187610bTimo Sirainen
d5cebe7f98e63d4e2822863ef2faa4971e8b3a5dTimo Sirainen if (block.hdr != NULL)
09c3a491f4f6ccebe290c7709bdc0d79a187610bTimo Sirainen continue;
d5cebe7f98e63d4e2822863ef2faa4971e8b3a5dTimo Sirainen
d5cebe7f98e63d4e2822863ef2faa4971e8b3a5dTimo Sirainen /* end of headers - verify that we can use this
d5cebe7f98e63d4e2822863ef2faa4971e8b3a5dTimo Sirainen Content-Type. we get here only once, because we
09c3a491f4f6ccebe290c7709bdc0d79a187610bTimo Sirainen always handle only one non-multipart MIME part. */
09c3a491f4f6ccebe290c7709bdc0d79a187610bTimo Sirainen ct = message_decoder_current_content_type(decoder);
09c3a491f4f6ccebe290c7709bdc0d79a187610bTimo Sirainen if (ct == NULL)
09c3a491f4f6ccebe290c7709bdc0d79a187610bTimo Sirainen /* text/plain */ ;
8d80659e504ffb34bb0c6a633184fece35751b18Timo Sirainen else if (strcasecmp(ct, "text/html") == 0) {
8d80659e504ffb34bb0c6a633184fece35751b18Timo Sirainen ctx.html2text = mail_html2text_init(MAIL_HTML2TEXT_FLAG_SKIP_QUOTED);
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen ctx.plain_output = buffer_create_dynamic(pool, 1024);
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen } else if (strncasecmp(ct, "text/", 5) != 0)
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen break;
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen continue;
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen }
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen if (!snippet_generate(&ctx, block.data, block.size))
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen break;
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen }
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen i_assert(ret != 0);
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen message_decoder_deinit(&decoder);
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen if (message_parser_deinit(&parser, &parts) < 0)
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen i_unreached();
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen if (ctx.html2text != NULL)
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen mail_html2text_deinit(&ctx.html2text);
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen pool_unref(&pool);
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen return input->stream_errno == 0 ? 0 : -1;
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen}
5ada3f57a970f226eb29956d30f66afc3537200dTimo Sirainen