bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2011-2018 Dovecot authors, see the included COPYING file */
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen#include "lib.h"
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen#include "buffer.h"
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen#include "unichar.h"
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen#include "message-parser.h"
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen#include "fts-parser.h"
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen
0b3e92b6043435c5aa9f1cf1d04b632f3e19abd9Phil Carmodystatic const struct fts_parser_vfuncs *parsers[] = {
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen &fts_parser_html,
ce87b647a0418ec4a6f4e860a9c918b2331ab353Timo Sirainen &fts_parser_script,
ce87b647a0418ec4a6f4e860a9c918b2331ab353Timo Sirainen &fts_parser_tika
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen};
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen
/* NULL-terminated list of content types that fts_parser_init() refuses to
   hand to any parser backend. */
static const char *plaintext_content_types[] = {
	/* ordinary text bodies */
	"text/plain",
	/* delivery / read-receipt reports */
	"message/delivery-status",
	"message/disposition-notification",
	/* detached PGP signatures */
	"application/pgp-signature",
	/* terminator */
	NULL
};
657f76e8b68cac605c23171862f66f2bf070eed1Timo Sirainen
43f4e550efe7afa72aab3a7ded7ad7ce606526fbSergey Kitovbool fts_parser_init(struct fts_parser_context *parser_context,
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen struct fts_parser **parser_r)
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen{
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen unsigned int i;
43f4e550efe7afa72aab3a7ded7ad7ce606526fbSergey Kitov i_assert(parser_context->user != NULL);
43f4e550efe7afa72aab3a7ded7ad7ce606526fbSergey Kitov i_assert(parser_context->content_type != NULL);
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen
43f4e550efe7afa72aab3a7ded7ad7ce606526fbSergey Kitov if (str_array_find(plaintext_content_types, parser_context->content_type)) {
4682f86ba3e1e019fc6f348d95cec17a429287cfTimo Sirainen /* we probably don't want/need to allow parsers to handle
4682f86ba3e1e019fc6f348d95cec17a429287cfTimo Sirainen plaintext? */
38a79b5a7ac2b1565cde1a128ae97d7e44e43f73Teemu Huovila return FALSE;
4682f86ba3e1e019fc6f348d95cec17a429287cfTimo Sirainen }
4682f86ba3e1e019fc6f348d95cec17a429287cfTimo Sirainen
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen for (i = 0; i < N_ELEMENTS(parsers); i++) {
43f4e550efe7afa72aab3a7ded7ad7ce606526fbSergey Kitov *parser_r = parsers[i]->try_init(parser_context);
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen if (*parser_r != NULL)
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen return TRUE;
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen }
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen return FALSE;
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen}
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainenstruct fts_parser *fts_parser_text_init(void)
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen{
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen return i_new(struct fts_parser, 1);
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen}
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen
/* Returns true if any of the first size bytes at data is a NUL byte,
   false otherwise (including when size is 0). */
static bool data_has_nuls(const unsigned char *data, size_t size)
{
	/* memchr() does the byte scan for us. It is skipped for empty input
	   so that a NULL data pointer with size 0 stays as harmless as it was
	   with the hand-written loop. */
	return size > 0 && memchr(data, '\0', size) != NULL;
}
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainenstatic void replace_nul_bytes(buffer_t *buf)
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen{
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen unsigned char *data;
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen size_t i, size;
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen data = buffer_get_modifiable_data(buf, &size);
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen for (i = 0; i < size; i++) {
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen if (data[i] == '\0')
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen data[i] = ' ';
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen }
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen}
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainenvoid fts_parser_more(struct fts_parser *parser, struct message_block *block)
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen{
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen if (parser->v.more != NULL)
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen parser->v.more(parser, block);
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen if (!uni_utf8_data_is_valid(block->data, block->size) ||
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen data_has_nuls(block->data, block->size)) {
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen /* output isn't valid UTF-8. make it. */
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen if (parser->utf8_output == NULL) {
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen parser->utf8_output =
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen buffer_create_dynamic(default_pool, 4096);
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen } else {
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen buffer_set_used_size(parser->utf8_output, 0);
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen }
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen (void)uni_utf8_get_valid_data(block->data, block->size,
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen parser->utf8_output);
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen replace_nul_bytes(parser->utf8_output);
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen block->data = parser->utf8_output->data;
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen block->size = parser->utf8_output->used;
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen }
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen}
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen
72d301f5751986396533088b769a9e74735a467cSergey Kitovint fts_parser_deinit(struct fts_parser **_parser, const char **retriable_err_msg_r)
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen{
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen struct fts_parser *parser = *_parser;
76f255362c881924e735581e54bdd7863b684eccTimo Sirainen int ret = 1;
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen *_parser = NULL;
ecc5a1a440799a0966c26da956f5d6e2d8073e03Timo Sirainen
6307d76096764e66bddc63d4a3e5a1aa19cc528fJosef 'Jeff' Sipek buffer_free(&parser->utf8_output);
f4c18fa75b5a1feebde68986bc7f9bc19f0afe59Timo Sirainen if (parser->v.deinit != NULL) {
f4c18fa75b5a1feebde68986bc7f9bc19f0afe59Timo Sirainen const char *error = NULL;
f4c18fa75b5a1feebde68986bc7f9bc19f0afe59Timo Sirainen ret = parser->v.deinit(parser, &error);
f4c18fa75b5a1feebde68986bc7f9bc19f0afe59Timo Sirainen if (ret == 0) {
f4c18fa75b5a1feebde68986bc7f9bc19f0afe59Timo Sirainen i_assert(error != NULL);
f4c18fa75b5a1feebde68986bc7f9bc19f0afe59Timo Sirainen if (retriable_err_msg_r != NULL)
f4c18fa75b5a1feebde68986bc7f9bc19f0afe59Timo Sirainen *retriable_err_msg_r = error;
f4c18fa75b5a1feebde68986bc7f9bc19f0afe59Timo Sirainen }
f4c18fa75b5a1feebde68986bc7f9bc19f0afe59Timo Sirainen } else
4d27f95c76bd008bb38f9c442567046da2b6ce14Timo Sirainen i_free(parser);
5518182f1165884742f9eb37eb2e6136b29394e8Timo Sirainen return ret;
772120713c176bde8c932b3fb4c413d223741fcdTimo Sirainen}
3ad57148af55e45cc45401162d3460ed0b237a10Timo Sirainen
3ad57148af55e45cc45401162d3460ed0b237a10Timo Sirainenvoid fts_parsers_unload(void)
3ad57148af55e45cc45401162d3460ed0b237a10Timo Sirainen{
3ad57148af55e45cc45401162d3460ed0b237a10Timo Sirainen unsigned int i;
3ad57148af55e45cc45401162d3460ed0b237a10Timo Sirainen
3ad57148af55e45cc45401162d3460ed0b237a10Timo Sirainen for (i = 0; i < N_ELEMENTS(parsers); i++) {
3ad57148af55e45cc45401162d3460ed0b237a10Timo Sirainen if (parsers[i]->unload != NULL)
3ad57148af55e45cc45401162d3460ed0b237a10Timo Sirainen parsers[i]->unload();
3ad57148af55e45cc45401162d3460ed0b237a10Timo Sirainen }
3ad57148af55e45cc45401162d3460ed0b237a10Timo Sirainen}