bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2011-2018 Dovecot authors, see the included COPYING file */
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen#include "lib.h"
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen#include "buffer.h"
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen#include "unichar.h"
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen#include "message-parser.h"
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen#include "mail-html2text.h"
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
/* True if c is one of the characters treated as HTML whitespace by this
   parser: space, tab, CR or LF. Note that the argument may be evaluated up
   to four times.

   Zero-width space (&#x200B;) apparently also belongs here, but that gets
   a bit tricky to handle.. is it actually used anywhere? */
#define HTML_WHITESPACE(c) \
	((c) == ' ' || \
	 (c) == '\t' || \
	 (c) == '\r' || \
	 (c) == '\n')
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
/* Parser states. The parser is always in exactly one of these. */
enum html_state {
	/* plain text outside of any markup */
	HTML_STATE_TEXT,
	/* inside <tag ...>, not within a quoted string */
	HTML_STATE_TAG,
	/* inside a tag, within a "double quoted string" */
	HTML_STATE_TAG_DQUOTED,
	/* inside a "double quoted string", right after a backslash */
	HTML_STATE_TAG_DQUOTED_ESCAPE,
	/* inside a tag, within a 'single quoted string' */
	HTML_STATE_TAG_SQUOTED,
	/* inside a 'single quoted string', right after a backslash */
	HTML_STATE_TAG_SQUOTED_ESCAPE,
	/* inside a comment */
	HTML_STATE_COMMENT,
	/* "--" has been seen inside a comment; waiting for the closing ">" */
	HTML_STATE_COMMENT_END,
	/* inside a (java)script element */
	HTML_STATE_SCRIPT,
	/* inside a CSS style element */
	HTML_STATE_STYLE,
	/* inside <![CDATA[...]]> */
	HTML_STATE_CDATA
};
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainenstruct mail_html2text {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen enum mail_html2text_flags flags;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen enum html_state state;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen buffer_t *input;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen unsigned int quote_level;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen bool ignore_next_text;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen};
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainenstatic struct {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen const char *name;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen unichar_t chr;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen} html_entities[] = {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen#include "html-entities.h"
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen};
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainenstruct mail_html2text *
c9141125278100269eb3a907c911afe78c46717cTimo Sirainenmail_html2text_init(enum mail_html2text_flags flags)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen{
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen struct mail_html2text *ht;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht = i_new(struct mail_html2text, 1);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->flags = flags;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->input = buffer_create_dynamic(default_pool, 512);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return ht;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen}
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainenstatic size_t
c9141125278100269eb3a907c911afe78c46717cTimo Sirainenparse_tag_name(struct mail_html2text *ht,
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen const unsigned char *data, size_t size)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen{
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen size_t i;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (size >= 3 && memcmp(data, "!--", 3) == 0) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_COMMENT;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return 3 + 1;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (size >= 7 && i_memcasecmp(data, "script", 6) == 0 &&
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen (HTML_WHITESPACE(data[6]) || data[6] == '>')) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_SCRIPT;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return 7 + 1;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (size >= 6 && i_memcasecmp(data, "style", 5) == 0 &&
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen (HTML_WHITESPACE(data[5]) || data[5] == '>')) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_STYLE;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return 6 + 1;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (size >= 8 && i_memcasecmp(data, "![CDATA[", 8) == 0) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_CDATA;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return 8 + 1;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if ((ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) != 0) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (size >= 10 && i_memcasecmp(data, "blockquote", 10) == 0 &&
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen (HTML_WHITESPACE(data[10]) || data[10] == '>')) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->quote_level++;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TAG;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return 1;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen } else if (ht->quote_level > 0 &&
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen size >= 12 && i_memcasecmp(data, "/blockquote>", 12) == 0) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (--ht->quote_level == 0)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->ignore_next_text = FALSE;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TAG;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return 1;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (size < 12) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen /* can we see the whole tag name? */
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen for (i = 0; i < size; i++) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (HTML_WHITESPACE(data[i]) || data[i] == '>')
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (i == size) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen /* need more data */
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return 0;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TAG;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return 1;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen}
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
/* Resolve an entity name (the text between '&' and ';') into a character.

   The name is first looked up case-insensitively from html_entities[].
   If that fails, numeric references of the form "#nnn" and "#xnnn" are
   tried; they're accepted only if uni_is_valid_ucs4() approves the value.
   Returns TRUE and sets *chr_r on success, FALSE (leaving *chr_r
   untouched) otherwise. */
static bool html_entity_get_unichar(const char *name, unichar_t *chr_r)
{
	unichar_t code;
	size_t idx;
	bool parsed;

	for (idx = 0; idx < N_ELEMENTS(html_entities); idx++) {
		if (strcasecmp(html_entities[idx].name, name) != 0)
			continue;
		*chr_r = html_entities[idx].chr;
		return TRUE;
	}

	/* not a named entity - maybe it's a numeric character reference */
	if (name[0] != '#')
		return FALSE;

	parsed = name[1] == 'x' && str_to_uint32_hex(name+2, &code) == 0;
	if (!parsed) {
		/* not (valid) hex, try decimal */
		parsed = str_to_uint32(name+1, &code) == 0;
	}
	if (!parsed || !uni_is_valid_ucs4(code))
		return FALSE;

	*chr_r = code;
	return TRUE;
}
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainenstatic size_t parse_entity(const unsigned char *data, size_t size,
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen buffer_t *output)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen{
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen char entity[10];
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen unichar_t chr;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen size_t i;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen for (i = 0; i < size; i++) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (HTML_WHITESPACE(data[i]) || i >= sizeof(entity)) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen /* broken entity */
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return 1;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (data[i] == ';')
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (i == size)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return 0;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen i_assert(i < sizeof(entity));
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen memcpy(entity, data, i); entity[i] = '\0';
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (html_entity_get_unichar(entity, &chr))
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen uni_ucs4_to_utf8_c(chr, output);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return i + 1 + 1;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen}
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
/* Append a single space to output, unless output is still empty or
   already ends with a space. */
static void mail_html2text_add_space(buffer_t *output)
{
	const unsigned char *bytes;

	if (output->used == 0)
		return;

	bytes = output->data;
	if (bytes[output->used-1] == ' ')
		return;
	buffer_append_c(output, ' ');
}
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainenstatic size_t
c9141125278100269eb3a907c911afe78c46717cTimo Sirainenparse_data(struct mail_html2text *ht,
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen const unsigned char *data, size_t size, buffer_t *output)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen{
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen size_t i, ret;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen for (i = 0; i < size; i++) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen char c = data[i];
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen switch (ht->state) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen case HTML_STATE_TEXT:
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (c == '<') {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ret = parse_tag_name(ht, data+i+1, size-i-1);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (ret == 0)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return i;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen i += ret - 1;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen } else if (c == '&') {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ret = parse_entity(data+i+1, size-i-1, output);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (ret == 0)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return i;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen i += ret - 1;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen } else if (ht->quote_level == 0) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen buffer_append_c(output, c);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen case HTML_STATE_TAG:
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (c == '"')
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TAG_DQUOTED;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen else if (c == '\'')
47dba7db73ded86c3c1f43b07b6752895b523bd6Phil Carmody ht->state = HTML_STATE_TAG_SQUOTED;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen else if (c == '>') {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TEXT;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen mail_html2text_add_space(output);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen case HTML_STATE_TAG_DQUOTED:
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (c == '"')
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TAG;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen else if (c == '\\')
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TAG_DQUOTED_ESCAPE;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen case HTML_STATE_TAG_DQUOTED_ESCAPE:
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TAG_DQUOTED;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen case HTML_STATE_TAG_SQUOTED:
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (c == '\'')
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TAG;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen else if (c == '\\')
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TAG_SQUOTED_ESCAPE;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen case HTML_STATE_TAG_SQUOTED_ESCAPE:
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TAG_SQUOTED;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen case HTML_STATE_COMMENT:
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (c == '-') {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (i+1 == size)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return i;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (data[i+1] == '-') {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_COMMENT_END;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen i++;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen case HTML_STATE_COMMENT_END:
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (c == '>')
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TEXT;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen else if (!HTML_WHITESPACE(c))
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_COMMENT;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen case HTML_STATE_SCRIPT:
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (c == '<') {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen unsigned int max_len = I_MIN(size-i, 9);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (i_memcasecmp(data+i, "</script>", max_len) == 0) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (max_len < 9)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return i;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen mail_html2text_add_space(output);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TEXT;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen i += 8;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen case HTML_STATE_STYLE:
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (c == '<') {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen unsigned int max_len = I_MIN(size-i, 8);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (i_memcasecmp(data+i, "</style>", max_len) == 0) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (max_len < 8)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return i;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen mail_html2text_add_space(output);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TEXT;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen i += 7;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen case HTML_STATE_CDATA:
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (c == ']') {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen unsigned int max_len = I_MIN(size-i, 3);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (i_memcasecmp(data+i, "]]>", max_len) == 0) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (max_len < 3)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return i;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->state = HTML_STATE_TEXT;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen i += 2;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (ht->quote_level == 0)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen buffer_append_c(output, c);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen break;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return i;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen}
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainenvoid mail_html2text_more(struct mail_html2text *ht,
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen const unsigned char *data, size_t size,
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen buffer_t *output)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen{
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen size_t pos, inc_size, buf_orig_size;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen i_assert(size > 0);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen while (ht->input->used > 0) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen /* we didn't get enough input the last time to know
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen what to do. */
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen buf_orig_size = ht->input->used;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen inc_size = I_MIN(size, 128);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen buffer_append(ht->input, data, inc_size);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen pos = parse_data(ht, ht->input->data,
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen ht->input->used, output);
e61d5a4bad247e62a257bb1ed6c483923d10c2a8Timo Sirainen if (pos == 0) {
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen /* we need to add more data into buffer */
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen data += inc_size;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen size -= inc_size;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen if (size == 0)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen return;
e61d5a4bad247e62a257bb1ed6c483923d10c2a8Timo Sirainen } else if (pos >= buf_orig_size) {
e61d5a4bad247e62a257bb1ed6c483923d10c2a8Timo Sirainen /* we parsed forward */
e61d5a4bad247e62a257bb1ed6c483923d10c2a8Timo Sirainen data += pos - buf_orig_size;
e61d5a4bad247e62a257bb1ed6c483923d10c2a8Timo Sirainen size -= pos - buf_orig_size;
e61d5a4bad247e62a257bb1ed6c483923d10c2a8Timo Sirainen buffer_set_used_size(ht->input, 0);
e61d5a4bad247e62a257bb1ed6c483923d10c2a8Timo Sirainen } else {
e61d5a4bad247e62a257bb1ed6c483923d10c2a8Timo Sirainen /* invalid input - eat away what we parsed so far
e61d5a4bad247e62a257bb1ed6c483923d10c2a8Timo Sirainen and retry */
e61d5a4bad247e62a257bb1ed6c483923d10c2a8Timo Sirainen buffer_set_used_size(ht->input, buf_orig_size);
e61d5a4bad247e62a257bb1ed6c483923d10c2a8Timo Sirainen buffer_delete(ht->input, 0, pos);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen }
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen pos = parse_data(ht, data, size, output);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen buffer_append(ht->input, data + pos, size - pos);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen}
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
c9141125278100269eb3a907c911afe78c46717cTimo Sirainenvoid mail_html2text_deinit(struct mail_html2text **_ht)
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen{
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen struct mail_html2text *ht = *_ht;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen
86174ed1451d1ca877e02cb06dc484c34b707180Josef 'Jeff' Sipek if (ht == NULL)
86174ed1451d1ca877e02cb06dc484c34b707180Josef 'Jeff' Sipek return;
86174ed1451d1ca877e02cb06dc484c34b707180Josef 'Jeff' Sipek
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen *_ht = NULL;
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen buffer_free(&ht->input);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen i_free(ht);
c9141125278100269eb3a907c911afe78c46717cTimo Sirainen}