/* fts-parser-html.c revision 96f62326b890207d844f41baa8f736a04d973c30 */
/* Copyright (c) 2011 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "buffer.h"
#include "unichar.h"
#include "message-parser.h"
#include "fts-parser.h"
/* Evaluates to non-zero when c is a space, tab, carriage return or line
feed. The argument is expanded once per comparison, so pass an expression
without side effects. */
/* Zero-width space (U+200B) apparently also belongs here, but that gets a
bit tricky to handle.. is it actually used anywhere? */
#define HTML_WHITESPACE(c) \
((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
/* States of the HTML parser; struct html_fts_parser stores the current one
in its "state" member.
NOTE(review): only the per-state comments are visible here, no enumerator
names. The case labels later in this file use HTML_STATE_TEXT,
HTML_STATE_TAG, HTML_STATE_TAG_QUOTED, HTML_STATE_IGNORE,
HTML_STATE_COMMENT and HTML_STATE_COMMENT_END - confirm that every comment
below is followed by its enumerator. */
enum html_state {
/* regular text */
/* tag outside "quoted string" */
/* tag inside "quoted string" */
/* tag -> "escape\ */
/* comment */
/* comment is ending, we've seen "--" and now just waiting for ">" */
};
/* Per-instance data of the HTML parser. */
struct html_fts_parser {
/* generic struct fts_parser; it is the first member of this struct */
struct fts_parser parser;
/* current enum html_state value of the parser */
enum html_state state;
/* NOTE(review): presumably tells the parser to skip upcoming text; the
code that reads and writes this flag is not visible here - confirm */
bool ignore_next_text;
};
/* Table of HTML entities. Each element carries a "name" string and the
initializer rows are pulled in from html-entities.h. A loop later in this
file walks all N_ELEMENTS(html_entities) elements. */
struct {
const char *name;
} html_entities[] = {
#include "html-entities.h"
};
static struct fts_parser *
const char *content_type,
const char *content_disposition ATTR_UNUSED)
{
struct html_fts_parser *parser;
return NULL;
}
static bool
{
size_t i = 1;
return 3;
}
i = 5;
i = 6;
} else if (size <= 6) {
/* need more data */
return 0;
} else {
return 1;
}
return 1;
}
{
unsigned int i;
for (i = 0; i < N_ELEMENTS(html_entities); i++) {
return TRUE;
}
}
return FALSE;
}
{
char entity[10];
size_t i;
for (i = 0; i < size; i++) {
if (data[i] == ';')
break;
/* broken entity */
return 1;
}
}
if (i == size)
return 0;
return i + 1;
}
{
}
static size_t
{
for (i = 0; i < size; i++) {
char c = data[i];
case HTML_STATE_TEXT:
if (c == '<') {
if (ret == 0)
return i;
i += ret - 1;
} else if (c == '&') {
if (ret == 0)
return i;
i += ret - 1;
} else {
}
break;
case HTML_STATE_TAG:
if (c == '"')
else if (c == '>') {
}
break;
case HTML_STATE_TAG_QUOTED:
if (c == '"')
else if (c == '\\')
break;
break;
case HTML_STATE_IGNORE:
if (c == '<') {
}
break;
case HTML_STATE_COMMENT:
if (c == '-') {
if (i+1 == size)
return i;
i++;
}
}
break;
case HTML_STATE_COMMENT_END:
if (c == '>')
else if (!HTML_WHITESPACE(c))
break;
}
}
return i;
}
struct message_block *block)
{
/* we didn't get enough input the last time to know
what to do. */
if (size == 0) {
/* we're at EOF and can't finish this */
} else {
}
}
}
{
}
/* The struct fts_parser_vfuncs object for this parser, defined with
external linkage under the name fts_parser_html.
NOTE(review): the initializer list is empty as shown here - confirm which
function pointers are supposed to be listed. */
struct fts_parser_vfuncs fts_parser_html = {
};