/* Copyright (c) 2015-2018 Dovecot authors, see the included COPYING file */
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen#include "lib.h"
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen#include "buffer.h"
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen#include "str.h"
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen#include "istream.h"
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen#include "mail-html2text.h"
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen#include "message-parser.h"
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen#include "message-decoder.h"
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen#include "message-snippet.h"
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen
/* Scanner state kept between input bytes (and between input blocks)
   while the snippet text is being collected. */
enum snippet_state {
	/* positioned at the first byte of a line */
	SNIPPET_STATE_NEWLINE = 0,
	/* inside ordinary text of the current line */
	SNIPPET_STATE_NORMAL = 1,
	/* inside a quoted line - everything is dropped until the next '\n' */
	SNIPPET_STATE_QUOTED = 2
};
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainenstruct snippet_context {
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen string_t *snippet;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen unsigned int chars_left;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen enum snippet_state state;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen bool add_whitespace;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen struct mail_html2text *html2text;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen buffer_t *plain_output;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen};
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainenstatic bool snippet_generate(struct snippet_context *ctx,
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen const unsigned char *data, size_t size)
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen{
73083841b92505091b7d1070554930651b91b701Aki Tuomi size_t i, count;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (ctx->html2text != NULL) {
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen buffer_set_used_size(ctx->plain_output, 0);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen mail_html2text_more(ctx->html2text, data, size,
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen ctx->plain_output);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen data = ctx->plain_output->data;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen size = ctx->plain_output->used;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen }
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen /* message-decoder should feed us only valid and complete
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen UTF-8 input */
73083841b92505091b7d1070554930651b91b701Aki Tuomi
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen for (i = 0; i < size; i += count) {
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen count = 1;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen switch (ctx->state) {
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen case SNIPPET_STATE_NEWLINE:
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (data[i] == '>' && ctx->html2text == NULL) {
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen ctx->state = SNIPPET_STATE_QUOTED;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen break;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen }
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen ctx->state = SNIPPET_STATE_NORMAL;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen /* fallthrough */
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen case SNIPPET_STATE_NORMAL:
73083841b92505091b7d1070554930651b91b701Aki Tuomi if (size-i >= 3 &&
099737facc44553349a15521cdfcc65b112054caTimo Sirainen ((data[i] == 0xEF && data[i+1] == 0xBB && data[i+2] == 0xBF) ||
099737facc44553349a15521cdfcc65b112054caTimo Sirainen (data[i] == 0xBF && data[i+1] == 0xBB && data[i+2] == 0xEF))) {
73083841b92505091b7d1070554930651b91b701Aki Tuomi count += 2; /* because we skip +1 next */
73083841b92505091b7d1070554930651b91b701Aki Tuomi break;
73083841b92505091b7d1070554930651b91b701Aki Tuomi }
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (data[i] == '\r' || data[i] == '\n' ||
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen data[i] == '\t' || data[i] == ' ') {
73083841b92505091b7d1070554930651b91b701Aki Tuomi /* skip any leading whitespace */
73083841b92505091b7d1070554930651b91b701Aki Tuomi if (str_len(ctx->snippet) > 1)
73083841b92505091b7d1070554930651b91b701Aki Tuomi ctx->add_whitespace = TRUE;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (data[i] == '\n')
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen ctx->state = SNIPPET_STATE_NEWLINE;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen break;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen }
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (ctx->add_whitespace) {
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen str_append_c(ctx->snippet, ' ');
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen ctx->add_whitespace = FALSE;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (ctx->chars_left-- == 0)
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen return FALSE;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen }
a05819736f348d0c5ac8b4966ac6b04c21e1a391Timo Sirainen if (ctx->chars_left == 0)
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen return FALSE;
a05819736f348d0c5ac8b4966ac6b04c21e1a391Timo Sirainen ctx->chars_left--;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen count = uni_utf8_char_bytes(data[i]);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen i_assert(i + count <= size);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen str_append_n(ctx->snippet, data + i, count);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen break;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen case SNIPPET_STATE_QUOTED:
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (data[i] == '\n')
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen ctx->state = SNIPPET_STATE_NEWLINE;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen break;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen }
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen }
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen return TRUE;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen}
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainenint message_snippet_generate(struct istream *input,
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen unsigned int max_snippet_chars,
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen string_t *snippet)
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen{
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen struct message_parser_ctx *parser;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen struct message_part *parts;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen struct message_decoder_context *decoder;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen struct message_block raw_block, block;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen struct snippet_context ctx;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen pool_t pool;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen int ret;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen
efe78d3ba24fc866af1c79b9223dc0809ba26cadStephan Bosch i_zero(&ctx);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen pool = pool_alloconly_create("message snippet", 1024);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen ctx.snippet = snippet;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen ctx.chars_left = max_snippet_chars;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen parser = message_parser_init(pool_datastack_create(), input, 0, 0);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen decoder = message_decoder_init(NULL, 0);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (!message_decoder_decode_next_block(decoder, &raw_block, &block))
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen continue;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (block.size == 0) {
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen const char *ct;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (block.hdr != NULL)
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen continue;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen /* end of headers - verify that we can use this
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen Content-Type. we get here only once, because we
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen always handle only one non-multipart MIME part. */
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen ct = message_decoder_current_content_type(decoder);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (ct == NULL)
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen /* text/plain */ ;
00a50d6dc74e15b5b83afcb8c12c2109a2ca376dTimo Sirainen else if (mail_html2text_content_type_match(ct)) {
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen ctx.html2text = mail_html2text_init(MAIL_HTML2TEXT_FLAG_SKIP_QUOTED);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen ctx.plain_output = buffer_create_dynamic(pool, 1024);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen } else if (strncasecmp(ct, "text/", 5) != 0)
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen break;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen continue;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen }
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen if (!snippet_generate(&ctx, block.data, block.size))
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen break;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen }
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen i_assert(ret != 0);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen message_decoder_deinit(&decoder);
12e5ac049bd74f8b98d9dc62adcb0bf3217beef6Martti Rannanjärvi message_parser_deinit(&parser, &parts);
a2e1929a266a0260fd429fb587588f51d2bd8a96Josef 'Jeff' Sipek mail_html2text_deinit(&ctx.html2text);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen pool_unref(&pool);
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen return input->stream_errno == 0 ? 0 : -1;
7f7be2cbf68f8a202a688d5bc50f82483d461643Timo Sirainen}