src/lib-mail/message-body-search.c

	message-body-search.c revision cd56a23e21f1df3f79648cf07e2f4385e2fadebb
/* Copyright (C) 2002 Timo Sirainen */

#include "lib.h"
#include "base64.h"
#include "buffer.h"
#include "istream.h"
#include "strescape.h"
#include "charset-utf8.h"
#include "quoted-printable.h"
#include "message-parser.h"
#include "message-content-parser.h"
#include "message-header-search.h"
#include "message-body-search.h"

/* Upper bound on the amount of data handled per pass:
   message_search_body() caps each raw input chunk to this many bytes and
   message_search_body_block() uses it as the size of the buffer that
   receives the charset-converted output. */
#define DECODE_BLOCK_SIZE 8192

/* State shared by every part searched during one message_body_search()
   call. Filled in by message_body_search_init(). */
struct body_search_context {
	/* zeroed by message_body_search_init(); not referenced otherwise in
	   the code visible here */
	pool_t pool;

	/* search key as returned by charset_to_ucase_utf8_string() */
	const char *key;
	/* size of key, as reported by charset_to_ucase_utf8_string() */
	size_t key_len;

	/* charset name the caller supplied for the key */
	const char *charset;
	/* set when the caller supplied a NULL charset */
	unsigned int unknown_charset:1;
	/* when unset, the headers of the top-level part (the one without a
	   parent) are not matched against the key */
	unsigned int search_header:1;
};

/* Per-part search state. Zeroed for every message part in
   message_body_search_ctx(). */
struct part_search_context {
	/* the search-wide context this part belongs to */
	struct body_search_context *body_ctx;

	/* charset converter created in message_search_body() from
	   content_charset, falling back to "ascii" */
	struct charset_translation *translation;

	/* input bytes the charset conversion reported as an incomplete
	   sequence; they are prepended to the next block */
	buffer_t *decode_buf;
	/* array of size_t, one per partial match in progress: how many bytes
	   of the key have matched so far */
	buffer_t *match_buf;

	/* copy of the first Content-Type value seen in the part's header */
	char *content_type;
	/* copy of the first "charset" Content-Type parameter seen */
	char *content_charset;

	/* Content-Transfer-Encoding is quoted-printable */
	unsigned int content_qp:1;
	/* Content-Transfer-Encoding is base64 */
	unsigned int content_base64:1;
	/* Content-Transfer-Encoding wasn't recognized; body is skipped */
	unsigned int content_unknown:1;
	unsigned int content_type_text:1; /* text/any or message/any */
	/* don't match this part's header lines against the key */
	unsigned int ignore_header:1;
};

/* Content-Type value callback for message_content_parse_header().
   Stores a copy of the first Content-Type value seen for the part and
   records whether it begins with "text/" or "message/" (case-insensitive).
   Any further Content-Type values for the same part are ignored.
   context is the part's struct part_search_context. */
static void parse_content_type(const unsigned char *value, size_t value_len,
			       void *context)
{
	struct part_search_context *part = context;
	const char *type;
	int textual;

	if (part->content_type != NULL) {
		/* already have one, keep the first */
		return;
	}

	part->content_type = i_strndup(value, value_len);
	type = part->content_type;

	textual = strncasecmp(type, "text/", 5) == 0;
	if (!textual)
		textual = strncasecmp(type, "message/", 8) == 0;
	part->content_type_text = textual;
}

/* Content-Type parameter callback for message_content_parse_header().
   Stores a copy of the first "charset" parameter (name compared
   case-insensitively) in the part context. When the value was quoted, the
   stored copy is passed through str_unescape(). All other parameters and
   any later charset parameters are ignored.
   context is the part's struct part_search_context. */
static void
parse_content_type_param(const unsigned char *name, size_t name_len,
			 const unsigned char *value, size_t value_len,
			 bool value_quoted, void *context)
{
	struct part_search_context *part = context;

	if (name_len != 7 || memcasecmp(name, "charset", 7) != 0) {
		/* not the parameter we care about */
		return;
	}
	if (part->content_charset != NULL) {
		/* keep the first charset seen */
		return;
	}

	part->content_charset = i_strndup(value, value_len);
	if (value_quoted)
		str_unescape(part->content_charset);
}

/* Content-Transfer-Encoding value callback for
   message_content_parse_header(). Names are compared case-insensitively:
     7bit, 8bit, binary  - no flag is changed
     base64              - sets content_base64
     quoted-printable    - sets content_qp
     anything else       - sets content_unknown
   context is the part's struct part_search_context. */
static void parse_content_encoding(const unsigned char *value, size_t value_len,
				   void *context)
{
	struct part_search_context *part = context;

	if (value_len == 4 &&
	    (memcasecmp(value, "7bit", 4) == 0 ||
	     memcasecmp(value, "8bit", 4) == 0)) {
		/* body needs no transfer decoding */
		return;
	}
	if (value_len == 6 && memcasecmp(value, "binary", 6) == 0) {
		/* body needs no transfer decoding */
		return;
	}

	if (value_len == 6 && memcasecmp(value, "base64", 6) == 0)
		part->content_base64 = TRUE;
	else if (value_len == 16 &&
		 memcasecmp(value, "quoted-printable", 16) == 0)
		part->content_qp = TRUE;
	else
		part->content_unknown = TRUE;
}

/* Reads the header block of one message part from input.
   Unless ctx->ignore_header is set, each header line's value is matched
   against the search key; the first hit stops the parsing and returns TRUE.
   Content-Type and Content-Transfer-Encoding headers are additionally parsed
   into ctx (type, charset, transfer encoding) so that the body can be
   decoded afterwards. The part is assumed to be text until a Content-Type
   header says otherwise.
   Returns TRUE if the key was found in a header, FALSE otherwise. */
static bool message_search_header(struct part_search_context *ctx,
				  struct istream *input)
{
	struct header_search_context *key_search;
	struct message_header_parser_ctx *parser;
	struct message_header_line *line;
	bool is_type, is_encoding, matched = FALSE;
	int ret;

	/* Our key is in UTF-8. It can't be invalid. */
	key_search = message_header_search_init(pool_datastack_create(),
						ctx->body_ctx->key,
						"UTF-8", NULL);
	i_assert(key_search != NULL);

	/* text content-type is the default */
	ctx->content_type_text = TRUE;

	parser = message_parse_header_init(input, NULL, TRUE);
	while ((ret = message_parse_header_next(parser, &line)) > 0) {
		if (line->eoh)
			continue;

		if (!ctx->ignore_header &&
		    message_header_search(line->value, line->value_len,
					  key_search)) {
			matched = TRUE;
			break;
		}

		is_type = line->name_len == 12 &&
			strcasecmp(line->name, "Content-Type") == 0;
		is_encoding = !is_type && line->name_len == 25 &&
			strcasecmp(line->name,
				   "Content-Transfer-Encoding") == 0;
		if (!is_type && !is_encoding)
			continue;

		if (line->continues) {
			/* ask for the whole value and wait until the last
			   line of this header before parsing it */
			line->use_full_value = TRUE;
			continue;
		}

		if (is_type) {
			message_content_parse_header(line->full_value,
						     line->full_value_len,
						     parse_content_type,
						     parse_content_type_param,
						     ctx);
		} else {
			message_content_parse_header(line->full_value,
						     line->full_value_len,
						     parse_content_encoding,
						     NULL, ctx);
		}
	}
	i_assert(ret != 0);
	message_parse_header_deinit(&parser);

	return matched;
}

/* Scans one block of converted text for the search key.
   ctx->match_buf holds one size_t per partial match in progress, giving the
   number of key bytes matched so far. Because it lives in ctx, a match may
   begin in one block and be completed in a later one.
   Returns TRUE as soon as the whole key has been matched, FALSE if the
   block ended first. */
static bool message_search_decoded_block(struct part_search_context *ctx,
					 buffer_t *block)
{
	const unsigned char *needle, *text;
	size_t needle_len, text_len, *progress, active, idx, pos, started;
	unsigned char ch;

	needle = (const unsigned char *) ctx->body_ctx->key;
	needle_len = ctx->body_ctx->key_len;

	progress = buffer_get_modifyable_data(ctx->match_buf, &active);
	active /= sizeof(size_t);

	text = buffer_get_data(block, &text_len);
	for (pos = 0; pos < text_len; pos++) {
		ch = text[pos];

		/* advance or drop every partial match, newest first, so that
		   deleting an entry doesn't disturb the ones still to be
		   visited */
		for (idx = active; idx > 0; ) {
			idx--;
			if (needle[progress[idx]] != ch) {
				/* non-match */
				buffer_delete(ctx->match_buf,
					      idx * sizeof(size_t),
					      sizeof(size_t));
				active--;
				continue;
			}

			progress[idx]++;
			if (progress[idx] == needle_len) {
				/* full match */
				return TRUE;
			}
		}

		if (ch != needle[0])
			continue;

		if (needle_len == 1) {
			/* only one character in search key */
			return TRUE;
		}

		/* this byte may begin a new match */
		started = 1;
		buffer_append(ctx->match_buf, &started, sizeof(started));
		active++;
	}

	return FALSE;
}

/* Runs one transfer-decoded block through charset_to_ucase_utf8() using
   ctx->translation and searches the converted output for the key.
   The output buffer is limited to DECODE_BLOCK_SIZE bytes, so a block may
   need several conversion passes. Input bytes that the conversion reports
   as an incomplete sequence are kept in ctx->decode_buf and prepended to
   the next block.

   returns 1 = found, 0 = not found, -1 = error in input data */
static int message_search_body_block(struct part_search_context *ctx,
				     buffer_t *block)
{
	const unsigned char *inbuf;
	buffer_t *outbuf;
	enum charset_result result;
	size_t block_size, block_pos, inbuf_size, inbuf_left;

	outbuf = buffer_create_static_hard(pool_datastack_create(),
					   DECODE_BLOCK_SIZE);
	block_size = buffer_get_used_size(block);
	for (block_pos = 0; block_pos < block_size; ) {
		if (buffer_get_used_size(ctx->decode_buf) == 0) {
			/* we can use the buffer directly without copying */
			inbuf = buffer_get_data(block, &inbuf_size);
			inbuf += block_pos; inbuf_size -= block_pos;
		} else {
			/* some characters already in buffer, ie. last
			   conversion contained partial data */
			buffer_append_buf(ctx->decode_buf, block,
					  block_pos, block_size - block_pos);

			inbuf = buffer_get_data(ctx->decode_buf, &inbuf_size);
		}
		/* The rest of the block has now been handed to the
		   converter. Whatever it leaves unconsumed is subtracted
		   below. This must be an assignment: adding the block size
		   to a non-zero block_pos (ie. after an earlier
		   output-full pass) would overshoot and make a later pass
		   skip input bytes. */
		block_pos = block_size;

		buffer_set_used_size(outbuf, 0);
		inbuf_left = inbuf_size;
		result = charset_to_ucase_utf8(ctx->translation,
					       inbuf, &inbuf_size, outbuf);
		/* inbuf_size now holds the number of bytes consumed */
		inbuf_left -= inbuf_size;

		switch (result) {
		case CHARSET_RET_OUTPUT_FULL:
			/* we should have copied the incomplete sequence.. */
			i_assert(inbuf_left <= block_pos);
			/* fall through */
		case CHARSET_RET_OK:
			buffer_set_used_size(ctx->decode_buf, 0);
			/* continue from the first unconsumed byte */
			block_pos -= inbuf_left;
			break;
		case CHARSET_RET_INCOMPLETE_INPUT:
			/* save the partial sequence to buffer */
			buffer_write(ctx->decode_buf, 0,
				     inbuf + inbuf_size, inbuf_left);
			buffer_set_used_size(ctx->decode_buf, inbuf_left);
			break;

		case CHARSET_RET_INVALID_INPUT:
			return -1;
		}

		if (message_search_decoded_block(ctx, outbuf))
			return 1;
	}

	return 0;
}

/* Searches the body of one leaf message part for the key.
   The body is read from input through a limit stream covering exactly the
   part's body, in chunks of at most DECODE_BLOCK_SIZE bytes. Each chunk is
   transfer-decoded according to the flags gathered from the part's header
   (quoted-printable, base64, or used as is) and then passed to
   message_search_body_block().
   Parts with an unrecognized transfer encoding or a non-text content type
   are not searched at all. Reading stops at corrupted base64 data and at
   the first block that reports a match or invalid input.
   Returns TRUE if the key was found, FALSE otherwise. */
static bool message_search_body(struct part_search_context *ctx,
				struct istream *input,
				const struct message_part *part)
{
	struct istream *body_input;
	const unsigned char *chunk;
	buffer_t *decoded;
	pool_t stack_pool;
	size_t chunk_size, undecoded;
	ssize_t ret;
	bool found = FALSE;

	if (ctx->content_unknown) {
		/* unknown content-encoding-type, ignore */
		return FALSE;
	}
	if (!ctx->content_type_text) {
		/* non-text content, ignore - FIXME: should be configurable? */
		return FALSE;
	}

	/* use the part's own charset if there is one and a translation can
	   be created for it, otherwise fall back to ascii */
	ctx->translation = NULL;
	if (ctx->content_charset != NULL) {
		ctx->translation =
			charset_to_utf8_begin(ctx->content_charset, NULL);
	}
	if (ctx->translation == NULL)
		ctx->translation = charset_to_utf8_begin("ascii", NULL);

	ctx->decode_buf = buffer_create_dynamic(default_pool, 256);
	ctx->match_buf = buffer_create_static_hard(pool_datastack_create(),
						   sizeof(size_t) *
						   ctx->body_ctx->key_len);

	body_input = i_stream_create_limit(default_pool, input,
					   part->physical_pos +
					   part->header_size.physical_size,
					   part->body_size.physical_size);
	i_stream_seek(body_input, 0);

	/* undecoded = bytes of the previous chunk the decoder didn't consume;
	   the next read is asked for more than that */
	undecoded = 0;
	while (i_stream_read_data(body_input, &chunk, &chunk_size,
				  undecoded) > 0) {
		/* limit the size of t_malloc()s */
		if (chunk_size > DECODE_BLOCK_SIZE)
			chunk_size = DECODE_BLOCK_SIZE;
		undecoded = chunk_size;

		t_push();
		stack_pool = pool_datastack_create();
		if (ctx->content_qp) {
			decoded = buffer_create_static_hard(stack_pool,
							    chunk_size);
			quoted_printable_decode(chunk, chunk_size,
						&chunk_size, decoded);
		} else if (ctx->content_base64) {
			size_t max_size = MAX_BASE64_DECODED_SIZE(chunk_size);

			decoded = buffer_create_static_hard(stack_pool,
							    max_size);
			if (base64_decode(chunk, chunk_size,
					  &chunk_size, decoded) < 0) {
				/* corrupted base64 data, don't bother with
				   the rest of it */
				t_pop();
				break;
			}
		} else {
			decoded = buffer_create_const_data(stack_pool, chunk,
							   chunk_size);
		}

		ret = message_search_body_block(ctx, decoded);
		t_pop();

		if (ret != 0) {
			/* match, or invalid input which counts as no match */
			found = ret > 0;
			break;
		}

		/* chunk_size is now what the decoder used up (the whole chunk
		   when no decoding was needed) */
		i_stream_skip(body_input, chunk_size);
		undecoded -= chunk_size;
	}

	i_stream_destroy(&body_input);

	if (ctx->translation != NULL)
		charset_to_utf8_end(&ctx->translation);
	buffer_free(ctx->decode_buf);
	return found;
}

/* Initializes ctx for a new search.
   The key is converted with charset_to_ucase_utf8_string() from the given
   charset; unknown_charset_r is passed on to that function.
   Returns FALSE if the conversion yields no key (ctx is left zeroed),
   TRUE once ctx is ready to use. */
static bool
message_body_search_init(struct body_search_context *ctx,
			 const char *key, const char *charset,
			 bool *unknown_charset_r, bool search_header)
{
	const char *ucase_key;
	size_t ucase_key_len;

	memset(ctx, 0, sizeof(*ctx));

	/* get the key uppercased */
	ucase_key = charset_to_ucase_utf8_string(charset, unknown_charset_r,
						 (const unsigned char *) key,
						 strlen(key), &ucase_key_len);
	if (ucase_key == NULL)
		return FALSE;

	ctx->key = ucase_key;
	ctx->key_len = ucase_key_len;
	ctx->charset = charset;
	ctx->unknown_charset = charset == NULL;
	ctx->search_header = search_header;

	/* the match buffer is sized as key_len * sizeof(size_t) */
	i_assert(ctx->key_len <= SSIZE_T_MAX/sizeof(size_t));

	return TRUE;
}

/* Searches part and all of its following siblings, descending into
   children, until the key is found or an error occurs.
   For each part the header is searched first (the top-level part's header
   only when ctx->search_header is set). Parts with children are then
   searched recursively, leaf parts have their body searched.
   Returns 1 = found, 0 = not found, -1 = the header size seen in input
   doesn't match the message_part information. */
static int message_body_search_ctx(struct body_search_context *ctx,
				   struct istream *input,
				   const struct message_part *part)
{
	struct part_search_context part_ctx;
	int ret;

	ret = 0;
	while (part != NULL && ret == 0) {
		i_assert(input->v_offset <= part->physical_pos);

		i_stream_skip(input, part->physical_pos - input->v_offset);

		memset(&part_ctx, 0, sizeof(part_ctx));
		part_ctx.body_ctx = ctx;
		part_ctx.ignore_header =
			part->parent == NULL && !ctx->search_header;

		t_push();

		if (message_search_header(&part_ctx, input)) {
			/* found / invalid search key */
			ret = 1;
		} else if (part->children != NULL) {
			/* multipart/xxx or message/rfc822. Pass the result
			   on as is, so that an error (-1) from a child part
			   isn't reported as a match. */
			ret = message_body_search_ctx(ctx, input,
						      part->children);
		} else {
			if (input->v_offset != part->physical_pos +
			    part->header_size.physical_size) {
				/* header size changed. */
				ret = -1;
			} else if (message_search_body(&part_ctx, input, part))
				ret = 1;
		}

		/* allocated by the Content-Type callbacks during header
		   parsing */
		i_free(part_ctx.content_type);
		i_free(part_ctx.content_charset);

		t_pop();

		part = part->next;
	}

	return ret;
}

int message_body_search(const char *key, const char *charset,
			struct istream *input,
			const struct message_part *part, bool search_header,
                        enum message_body_search_error *error_r)
{
        struct body_search_context ctx;
	int ret;
	bool unknown_charset;

	if (!message_body_search_init(&ctx, key, charset, &unknown_charset,
				      search_header)) {
		*error_r = unknown_charset ?
			MESSAGE_BODY_SEARCH_ERROR_UNKNOWN_CHARSET :
                        MESSAGE_BODY_SEARCH_ERROR_INVALID_KEY;
		return -1;
	}

	if ((ret = message_body_search_ctx(&ctx, input, part)) < 0)
		*error_r = MESSAGE_BODY_SEARCH_ERROR_MESSAGE_PART_BROKEN;
	return ret;
}