src/lib-mail/message-tokenize.c

	message-tokenize.c revision 7c424aa51c956c628e3512055841aa2f9eef4833
/* Copyright (C) 2002 Timo Sirainen */

#include "lib.h"
#include "str.h"
#include "strescape.h"
#include "message-tokenize.h"

/* State carried between message_tokenize_next() calls for one input buffer. */
struct message_tokenizer {
	/* input being tokenized and its length in bytes. the buffer is only
	   referenced, never copied. */
	const unsigned char *data;
	size_t size;

	/* optional callback invoked on syntax errors, plus the opaque
	   pointer that is handed back to it */
	message_tokenize_error_callback_t *error_cb;
	void *error_context;

	/* most recently produced token. -1 until the first
	   message_tokenize_next() call, TOKEN_LAST once tokenizing is over. */
	int token;

	/* for atom, comment, domain literal and quoted string tokens:
	   offset of the value within data and its length in bytes
	   (surrounding delimiters excluded) */
	size_t token_pos;
	size_t token_len;

	/* offset within data where the next scan resumes */
	size_t parse_pos;

	/* user-settable options. NOTE(review): skip_comments is stored but
	   not read anywhere in the code visible in this file. */
	unsigned int skip_comments:1;
	/* when set '.' is returned as its own token, otherwise it's treated
	   as part of an atom */
	unsigned int dot_token:1;

	/* set after '<' has been returned, cleared again by '>' */
	unsigned int in_bracket:1;
};

/* Report a syntax error that isn't about a specific missing character.
   Identical to PARSE_ERROR_MISSING() with '\0' as the character, so the
   same requirements apply: usable only where `tok', `data' and `i' are in
   scope and the enclosing function returns a token. */
#define PARSE_ERROR() \
	PARSE_ERROR_MISSING('\0')

/* Report that character `c' was expected at data[i]. Does nothing when no
   error callback is installed. If the callback returns zero, tokenizing is
   aborted: the tokenizer is marked finished and the enclosing function
   returns TOKEN_LAST. Expects `tok', `data' and `i' to be in scope. */
#define PARSE_ERROR_MISSING(c) \
	STMT_START { \
	if (tok->error_cb != NULL) { \
		if (!tok->error_cb(data, i, (c), tok->error_context)) { \
			tok->token = TOKEN_LAST; \
			return TOKEN_LAST; \
		} \
	} \
	} STMT_END


/* Create a tokenizer for the `size' bytes at `data'. The buffer is not
   copied, only the pointer is stored. `error_cb' may be NULL; otherwise it's
   called with `error_context' whenever a syntax error is found.

   Both the skip_comments and dot_token options start out enabled. The
   returned tokenizer is released with message_tokenize_deinit(). */
struct message_tokenizer *
message_tokenize_init(const unsigned char *data, size_t size,
		      message_tokenize_error_callback_t *error_cb,
		      void *error_context)
{
	/* NOTE(review): parse_pos and in_bracket are never assigned here, so
	   this presumably relies on i_new() returning zeroed memory -
	   confirm against its definition. */
	struct message_tokenizer *tokenizer =
		i_new(struct message_tokenizer, 1);

	/* -1 = nothing tokenized yet */
	tokenizer->token = -1;

	tokenizer->dot_token = TRUE;
	tokenizer->skip_comments = TRUE;

	tokenizer->error_context = error_context;
	tokenizer->error_cb = error_cb;

	tokenizer->size = size;
	tokenizer->data = data;

	return tokenizer;
}

/* Release a tokenizer created with message_tokenize_init(). Only the
   tokenizer structure itself is freed; the input buffer it points to is
   not touched. */
void message_tokenize_deinit(struct message_tokenizer *tok)
{
	i_free(tok);
}

/* Set the skip_comments option: enabled for any non-zero `set', disabled
   for zero.

   NOTE(review): the flag is stored, but nothing in the code visible in this
   file reads it. */
void message_tokenize_skip_comments(struct message_tokenizer *tok, int set)
{
	/* skip_comments is a 1-bit field: assigning `set' directly would keep
	   only its lowest bit, turning e.g. 2 into 0. */
	tok->skip_comments = set != 0;
}

/* Choose how '.' is tokenized: with a non-zero `set' it's returned as a
   token of its own, with zero it's treated like any other atom character. */
void message_tokenize_dot_token(struct message_tokenizer *tok, int set)
{
	/* dot_token is a 1-bit field: assigning `set' directly would keep
	   only its lowest bit, turning e.g. 2 into 0. */
	tok->dot_token = set != 0;
}

/* Scan forward from the saved parse position and return the next token.

   Returns TOKEN_LAST when the input is exhausted (end of buffer or a NUL
   byte) or when the error callback asked to abort by returning zero. After
   that every further call returns TOKEN_LAST as well.

   For atoms ('A'), comments ('('), domain literals ('[') and quoted strings
   ('"') the value's offset and length are saved for
   message_tokenize_get_value(); the surrounding delimiters aren't part of
   the value and backslash escapes are left as they are. Special characters
   are returned as themselves.

   Whitespace separates tokens and is otherwise skipped. An atom ends at the
   first whitespace or special character; that character isn't consumed but
   handled by the following call. */
enum message_token message_tokenize_next(struct message_tokenizer *tok)
{
	int token, level, last_atom;
	const unsigned char *data;
	size_t i, size;

	if (tok->token == TOKEN_LAST)
		return TOKEN_LAST;

	data = tok->data;
	size = tok->size;

	/* stays as TOKEN_LAST unless something is found below */
	tok->token = TOKEN_LAST;

	last_atom = FALSE;
	for (i = tok->parse_pos; i < size && data[i] != '\0'; i++) {
		/* -1 = this character produced no token */
		token = -1;
		switch (data[i]) {
		case ' ':
		case '\t':
		case '\r':
		case '\n':
			/* skip whitespace */
			break;

		case '(':
			/* (comment) - nesting is allowed */
			if (last_atom)
				break;

			token = '(';
			tok->token_pos = ++i;

			level = 1;
			for (; i < size && data[i] != '\0'; i++) {
				/* a backslash hides the next character
				   from the parenthesis counting */
				if (data[i] == '\\' &&
				    i+1 < size && data[i+1] != '\0')
					i++;
				else if (data[i] == '(')
					level++;
				else if (data[i] == ')') {
					if (--level == 0)
						break;
				}
			}

			if (level > 0)
				PARSE_ERROR_MISSING(')');

			tok->token_len = (size_t) (i - tok->token_pos);
			break;

		case '[':
			/* domain literal - nesting isn't allowed */
			if (last_atom)
				break;

			token = '[';
			tok->token_pos = ++i;

			while (i < size && data[i] != '\0' && data[i] != ']') {
				if (data[i] == '\\' &&
				    i+1 < size && data[i+1] != '\0')
					i++;
				else if (data[i] == '[') {
					/* nesting not allowed, but
					   continue anyway */
					PARSE_ERROR();
				}

				i++;
			}

			if (i == size || data[i] == '\0')
				PARSE_ERROR_MISSING(']');

			tok->token_len = (size_t) (i - tok->token_pos);
			break;

		case '"':
			/* quoted string */
			if (last_atom)
				break;

			token = '"';
			tok->token_pos = ++i;

			while (i < size && data[i] != '\0' && data[i] != '"') {
				if (data[i] == '\\' &&
				    i+1 < size && data[i+1] != '\0')
					i++;
				i++;
			}

			if (i == size || data[i] == '\0')
				PARSE_ERROR_MISSING('"');

			tok->token_len = (size_t) (i - tok->token_pos);
			break;

		case '<':
			if (last_atom)
				break;

			if (tok->in_bracket) {
				/* '<' cannot be nested */
				PARSE_ERROR();
			}

			token = '<';
			tok->in_bracket = TRUE;
			break;
		case '>':
			if (last_atom)
				break;

			if (!tok->in_bracket) {
				/* missing '<' */
				PARSE_ERROR();
			}

			token = '>';
			tok->in_bracket = FALSE;
			break;

		case ')':
		case ']':
		case '\\':
			/* stray closing character or backslash. if an atom
			   is still pending, return it first: this character
			   gets rescanned by the next call, so reporting the
			   error already here would report it twice and an
			   aborting callback would lose the atom. */
			if (last_atom)
				break;

			PARSE_ERROR();
			/* fall through */

		/* RFC822 specials: */
		case '@':
		case ',':
		case ';':
		case ':':
		case '.':
		/* RFC 2045 specials: */
		case '/':
		case '?':
		case '=':
			token = tok->data[i];
			if (token != '.' || tok->dot_token)
				break;
			/* '.' with dot_token disabled is an atom character */
			/* fall through */
		default:
			/* atom */
			token = 'A';
			if (!last_atom) {
				tok->token = token;
				tok->token_pos = i;
				last_atom = TRUE;
			}
			break;
		}

		if (last_atom) {
			if (token != 'A') {
				/* end of atom. i isn't advanced, the
				   terminating character is handled by the
				   next call. */
				tok->token_len = (size_t) (i - tok->token_pos);
				last_atom = FALSE;
				break;
			}
		} else {
			if (token != -1) {
				tok->token = token;
				/* step over the special character or the
				   closing delimiter, unless the input ended
				   before one was found */
				if (i < tok->size && data[i] != '\0')
					i++;
				break;
			}
		}

		if (i == tok->size || data[i] == '\0') {
			/* unexpected eol */
			break;
		}
	}

	if (last_atom) {
		/* end of atom */
		tok->token_len = (size_t) (i - tok->token_pos);
	}

	tok->parse_pos = i;

	if (tok->token == TOKEN_LAST && tok->in_bracket &&
	    tok->error_cb != NULL) {
		/* input ended while inside <..>. the callback's answer
		   can't change anything anymore, the token is TOKEN_LAST
		   either way. */
		(void)tok->error_cb(data, i, '>', tok->error_context);
	}

	return tok->token;
}

/* Return the token produced by the latest message_tokenize_next() call
   without advancing. The stored value is -1 if message_tokenize_next()
   hasn't been called yet. */
enum message_token message_tokenize_get(const struct message_tokenizer *tok)
{
	/* kept as int internally so that -1 can be stored */
	return (enum message_token) tok->token;
}

/* Return the offset in the input buffer where the next
   message_tokenize_next() call resumes scanning. */
size_t message_tokenize_get_parse_position(const struct message_tokenizer *tok)
{
	return tok->parse_pos;
}

/* Return the value of the current token and store its length in `*len'.
   The pointer refers directly into the tokenizer's input buffer, so the
   value isn't NUL-terminated and backslash escapes are still in place.

   May only be called while the current token satisfies IS_TOKEN_STRING(). */
const unsigned char *
message_tokenize_get_value(const struct message_tokenizer *tok, size_t *len)
{
	i_assert(IS_TOKEN_STRING(tok->token));

	*len = tok->token_len;
	return &tok->data[tok->token_pos];
}

/* Read tokens and append them to `str' until the input ends or one of
   `stop_tokens' is seen. `stop_tokens' is a TOKEN_LAST-terminated list; the
   stop token itself has already been consumed from the tokenizer when this
   returns, but it isn't appended.

   - special characters are appended as they are
   - atoms are appended as they are
   - quoted strings are appended unescaped and without the quotes
   - domain literals are appended inside [ ], not unescaped
   - comments never go to `str'. if `comments' is non-NULL they're appended
     to it unescaped and separated by spaces, otherwise they're dropped.

   A single space is inserted between two string tokens (atom, quoted
   string, domain literal) that follow each other. Comments between them
   don't affect this. */
void message_tokenize_get_string(struct message_tokenizer *tok,
				 string_t *str, string_t *comments,
				 const enum message_token *stop_tokens)
{
	const enum message_token *stop;
	const unsigned char *data;
	enum message_token token;
	size_t data_len;
	int prev_was_str;

	prev_was_str = FALSE;
	for (;;) {
		token = message_tokenize_next(tok);
		if (token == TOKEN_LAST)
			break;

		for (stop = stop_tokens; *stop != TOKEN_LAST; stop++) {
			if (*stop == token)
				return;
		}

		if (token == TOKEN_COMMENT) {
			/* comments are collected separately and leave
			   prev_was_str as it was */
			if (comments == NULL)
				continue;

			if (str_len(comments) > 0)
				str_append_c(comments, ' ');

			data = message_tokenize_get_value(tok, &data_len);
			str_append_unescaped(comments, data, data_len);
			continue;
		}

		if (token != TOKEN_ATOM && token != TOKEN_QSTRING &&
		    token != TOKEN_DLITERAL) {
			/* special character */
			str_append_c(str, token);
			prev_was_str = FALSE;
			continue;
		}

		if (prev_was_str)
			str_append_c(str, ' ');

		data = message_tokenize_get_value(tok, &data_len);
		if (token == TOKEN_QSTRING) {
			/* the quotes are dropped, so the escapes have to
			   go as well */
			str_append_unescaped(str, data, data_len);
		} else if (token == TOKEN_DLITERAL) {
			/* [] is kept, so the contents are kept escaped too */
			str_append_c(str, '[');
			str_append_n(str, data, data_len);
			str_append_c(str, ']');
		} else {
			str_append_n(str, data, data_len);
		}

		prev_was_str = TRUE;
	}
}