/* Copyright (c) 2011-2018 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "buffer.h"
#include "unichar.h"
#include "message-parser.h"
#include "mail-html2text.h"
/* Evaluates to nonzero when c is one of the ASCII whitespace characters
   handled here: space, tab, CR or LF. The argument is expanded once per
   comparison, so it should not have side effects.

   Zero-width space (&#x200B;, U+200B) apparently also belongs here, but
   that gets a bit tricky to handle.. is it actually used anywhere? */
#define HTML_WHITESPACE(c) \
((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
/* States of the HTML parser, one per kind of input region. The enumerators
   are listed in the order of their descriptions; the first one has value 0.

   NOTE(review): the two *_ESCAPE names are not referenced by any code in the
   visible part of this file. They follow the naming of their parent quoted
   states - confirm against the code that handles a backslash inside a quoted
   attribute value. */
enum html_state {
	/* regular text */
	HTML_STATE_TEXT,
	/* tag outside "quoted string" */
	HTML_STATE_TAG,
	/* tag inside "double quoted string" */
	HTML_STATE_TAG_DQUOTED,
	/* tag -> "escape\ */
	HTML_STATE_TAG_DQUOTED_ESCAPE,
	/* tag inside 'single quoted string' */
	HTML_STATE_TAG_SQUOTED,
	/* tag -> 'escape\ */
	HTML_STATE_TAG_SQUOTED_ESCAPE,
	/* comment */
	HTML_STATE_COMMENT,
	/* comment is ending, we've seen "--" and now just waiting for ">" */
	HTML_STATE_COMMENT_END,
	/* (java)script */
	HTML_STATE_SCRIPT,
	/* CSS style */
	HTML_STATE_STYLE,
	/* <![CDATA[...]]> */
	HTML_STATE_CDATA
};
/* Parser context that the functions in this file reach through their `ht`
   pointer. */
struct mail_html2text {
/* Nesting counter, incremented and decremented by the tag handling code.
   Text is appended to the output only while this is 0. */
unsigned int quote_level;
/* NOTE(review): not referenced in the visible part of this file -
   presumably makes the parser drop upcoming text; confirm where it is set. */
bool ignore_next_text;
};
/* Table of known HTML entities. The rows are supplied by the initializer
   list pulled in from html-entities.h. */
static struct {
/* entity name */
const char *name;
} html_entities[] = {
#include "html-entities.h"
};
/* Returns the `ht` context pointer to the caller.
   NOTE(review): the declarator line (function name and parameters) and the
   statements that create `ht` are missing from this view; only the return
   type and the final return statement are present. */
struct mail_html2text *
{
return ht;
}
/* Inspects the beginning of a tag and returns a byte count, or 0 when more
   input is needed before the tag name can be recognized.
   NOTE(review): the function name, its parameters and nearly all of the
   conditions are missing from this view, so the braces below do not
   balance. */
static size_t
{
size_t i;
/* NOTE(review): each "N + 1" presumably is the length of a matched token
   plus one more byte; the comparisons that select them are not visible -
   confirm against the full source. */
return 3 + 1;
}
return 7 + 1;
}
return 6 + 1;
}
return 8 + 1;
}
/* one more level of quoted content begins */
ht->quote_level++;
return 1;
} else if (ht->quote_level > 0 &&
/* one level of quoted content ends */
if (--ht->quote_level == 0)
return 1;
}
}
if (size < 12) {
/* can we see the whole tag name? */
for (i = 0; i < size; i++) {
break;
}
if (i == size) {
/* need more data */
return 0;
}
}
return 1;
}
/* Resolves an entity given by `name`: returns TRUE when it is accepted and
   FALSE otherwise. A name that starts with '#' is treated as a numeric
   character reference and is accepted only if uni_is_valid_ucs4() approves
   the resulting code point.
   NOTE(review): the function header, the first part of the body and part of
   the condition below are missing from this view. */
{
return TRUE;
}
}
/* maybe it's just an encoded binary byte;
it can be &#nnn; or &#xnnn;
*/
if (name[0] == '#' &&
uni_is_valid_ucs4(chr)) {
return TRUE;
}
return FALSE;
}
/* Scans `data` for the ';' that ends an entity.
   Returns 0 when no ';' is found within `size` bytes, 1 for a broken
   entity, and otherwise i + 2 where i is the index of the ';'.
   NOTE(review): the function header and the condition guarding the
   "broken entity" return are missing from this view. */
{
size_t i;
for (i = 0; i < size; i++) {
/* broken entity */
return 1;
}
if (data[i] == ';')
break;
}
/* reached the end of the input without seeing ';' */
if (i == size)
return 0;
return i + 1 + 1;
}
/* NOTE(review): only an empty brace pair is left here; the function header
   and its statements are missing from this view. */
{
}
/* Main scanner: walks data[0..size) one byte at a time and handles each byte
   according to the current parser state. Returns i, the index where
   scanning stopped. Every early return also hands back i, so a result
   smaller than `size` marks the byte at which the scanner gave up -
   presumably so that the caller can retry from there with more input
   (confirm at the call site).
   NOTE(review): the function name, parameter list, local declarations, the
   switch header and all state-changing statements are missing from this
   view, so several branches below have no visible body. */
static size_t
{
for (i = 0; i < size; i++) {
char c = data[i];
case HTML_STATE_TEXT:
/* For '<' and '&', ret is computed on lines that are not visible here.
   ret == 0 stops at this byte; otherwise ret - 1 further bytes are
   skipped on top of the loop's own i++. */
if (c == '<') {
if (ret == 0)
return i;
i += ret - 1;
} else if (c == '&') {
if (ret == 0)
return i;
i += ret - 1;
} else if (ht->quote_level == 0) {
/* plain text is copied only outside of quoted content */
buffer_append_c(output, c);
}
break;
case HTML_STATE_TAG:
/* quotes and '>' are the only bytes of interest inside a tag */
if (c == '"')
else if (c == '\'')
else if (c == '>') {
}
break;
case HTML_STATE_TAG_DQUOTED:
/* closing quote or backslash inside a double quoted string */
if (c == '"')
else if (c == '\\')
break;
break;
case HTML_STATE_TAG_SQUOTED:
/* closing quote or backslash inside a single quoted string */
if (c == '\'')
else if (c == '\\')
break;
break;
case HTML_STATE_COMMENT:
if (c == '-') {
/* the byte after '-' is needed too; stop if it is not available yet */
if (i+1 == size)
return i;
i++;
}
}
break;
case HTML_STATE_COMMENT_END:
if (c == '>')
else if (!HTML_WHITESPACE(c))
break;
case HTML_STATE_SCRIPT:
if (c == '<') {
/* NOTE(review): 9 presumably is the length of the closing script tag;
   the comparison itself is not visible here - confirm. */
if (max_len < 9)
return i;
i += 8;
}
}
break;
case HTML_STATE_STYLE:
if (c == '<') {
/* NOTE(review): 8 presumably is the length of the closing style tag;
   the comparison itself is not visible here - confirm. */
if (max_len < 8)
return i;
i += 7;
}
}
break;
case HTML_STATE_CDATA:
if (c == ']') {
/* NOTE(review): 3 presumably is the length of the CDATA terminator;
   the comparison itself is not visible here - confirm. */
if (max_len < 3)
return i;
i += 2;
break;
}
}
/* CDATA content is copied only outside of quoted content */
if (ht->quote_level == 0)
buffer_append_c(output, c);
break;
}
}
return i;
}
/* Handles a new piece of input, distinguishing three outcomes by `pos`:
   nothing was consumed, parsing moved past the previously buffered data, or
   parsing stopped inside it. Returns early when nothing was consumed and
   `size` is 0.
   NOTE(review): the function header and every statement that computes
   `pos` or acts on these outcomes are missing from this view. */
{
/* we didn't get enough input the last time to know
what to do. */
if (pos == 0) {
/* we need to add more data into buffer */
if (size == 0)
return;
} else if (pos >= buf_orig_size) {
/* we parsed forward */
} else {
/* invalid input - eat away what we parsed so far
and retry */
}
}
}
/* NOTE(review): the function header and the statements preceding the return
   are missing from this view; only a bare return is left. */
{
return;
}