src/lib-smtp/smtp-parser.c

/* Copyright (c) 2013-2018 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "net.h"
#include "str.h"
#include "strescape.h"

#include "smtp-parser.h"

#include <ctype.h>

/* Character definitions from RFC 5321/5322:

   textstring  = 1*(%d09 / %d32-126) ; HT, SP, Printable US-ASCII
               = 1*(%x09 / %x20-7e)
   ehlo-param  = 1*(%d33-126)
               = 1*(%x21-7e)
   ehlo-greet  = 1*(%d0-9 / %d11-12 / %d14-127)
               = 1*(%x00-09 / %x0b-0c / %x0e-7f)
   qtext       = %d32-33 / %d35-91 / %d93-126
               = %x20-21 / %x23-5B / %x5d-7e
   quoted-pair = %d92 %d32-126
               = %x5c %x20-7e
   atext       = ALPHA / DIGIT /    ; Printable US-ASCII
                 "!" / "#" /        ;  characters not including
                 "$" / "%" /        ;  specials.  Used for atoms.
                 "&" / "'" /
                 "*" / "+" /
                 "-" / "/" /
                 "=" / "?" /
                 "^" / "_" /
                 "`" / "{" /
                 "|" / "}" /
                 "~"
               = %x21 / %x23-27 / %x2a-2b / %x2d / %x2f-39 / %x3d /
                 %d3f / %x41-5a / %x5e-7e /
   esmtp-value = 1*(%d33-60 / %d62-126)
               = 1*(%x21-3c / %x3e-7e)
   dcontent    = %d33-90 / ; Printable US-ASCII
                 %d94-126  ; excl. "[", "\", "]"
               = %x21-5a / %x5e-7e
   xchar       = any ASCII CHAR between "!" (33) and "~" (126) inclusive,
                 except for "+" and "=". [RFC 3461]
               = %x21-2a / %2c-3c / %x3e-7e

   Bit mappings (FIXME: rearrange):

   (1<<0) => %x21-2a / %2c-3c / %x3e-7e  (xtext)
   (1<<1) => %x21 / %x23-27 / %x2a-2b / %x2d / %x2f-39 / %x3d /
               %d3f / %x41-5a / %x5e-7e /
   (1<<2) => %x28-29 / %x2c / %x2e / %x3a-3c / %x3e / %x40
   (1<<8) => %x00-09 / %x0b-0c / %x0e-20 / %x7f
   (1<<5) => %x09 / %5b-5d
   (1<<4) => %x5b / %x5d
   (1<<3) => %x20
   (1<<9) => %x22
   (1<<6) => %x2b
   (1<<7) => %x3d
 */

/* xtext */
const uint16_t smtp_xtext_char_mask = (1<<0);
/* atext */
const uint16_t smtp_atext_char_mask = (1<<1);
/* dcontent */
const uint16_t smtp_dcontent_char_mask = (1<<1)|(1<<2)|(1<<9);
/* qtext */
const uint16_t smtp_qtext_char_mask = (1<<1)|(1<<2)|(1<<3)|(1<<4);
/* textstring */
const uint16_t smtp_textstr_char_mask = (1<<1)|(1<<2)|(1<<9)|(1<<3)|(1<<5);
/* esmtp-value */
const uint16_t smtp_esmtp_value_char_mask = (1<<0)|(1<<6);
/* ehlo-param */
const uint16_t smtp_ehlo_param_char_mask = (1<<0)|(1<<6)|(1<<7);
/* ehlo-greet */
const uint16_t smtp_ehlo_greet_char_mask = (1<<0)|(1<<6)|(1<<7)|(1<<8);
/* quoted-pair */
const uint16_t smtp_qpair_char_mask = (1<<0)|(1<<3)|(1<<6)|(1<<7);

const uint16_t smtp_char_lookup[256] = {
    0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, // 00
    0x100, 0x120, 0x000, 0x100, 0x100, 0x000, 0x100, 0x100, // 08
    0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, // 10
    0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, // 18
    0x108, 0x003, 0x201, 0x003, 0x003, 0x003, 0x003, 0x003, // 20
    0x005, 0x005, 0x003, 0x042, 0x005, 0x003, 0x005, 0x003, // 28
    0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 30
    0x003, 0x003, 0x005, 0x005, 0x005, 0x082, 0x005, 0x003, // 38
    0x005, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 40
    0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 48
    0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 50
    0x003, 0x003, 0x003, 0x031, 0x021, 0x031, 0x003, 0x003, // 58
    0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 60
    0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 68
    0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 70
    0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x100, // 78

    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 80
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 88
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 90
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 98
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // a0
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // a8
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // b0
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // b8
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // c0
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // c8
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // d0
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // d8
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // e0
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // e8
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // f0
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // f8
};

/*
 * Parser
 */

void smtp_parser_init(struct smtp_parser *parser,
    pool_t pool, const char *data)
{
    parser->pool = pool;
    parser->begin = parser->cur = (unsigned char *)data;
    parser->end = (unsigned char *)data + strlen(data);
    parser->error = NULL;
}

/*
 * Common syntax
 */

static int
smtp_parser_parse_ldh_str(struct smtp_parser *parser,
    string_t *out)
{
    const unsigned char *pbegin = parser->cur, *palnum;

    /* Ldh-str = *( ALPHA / DIGIT / "-" ) Let-dig
       Let-dig = ALPHA / DIGIT
     */

    /* Ldh-str */
    palnum = NULL;
    while (parser->cur < parser->end) {
        if (i_isalnum(*parser->cur))
            palnum = parser->cur;
        else if (*parser->cur != '-')
            break;
        parser->cur++;
    }
    if (parser->cur == pbegin || palnum == NULL) {
        parser->cur = pbegin;
        return 0;
    }

    parser->cur = palnum+1;
    if (out != NULL)
        str_append_n(out, pbegin, parser->cur - pbegin);
    return 1;
}

int smtp_parser_parse_domain(struct smtp_parser *parser,
    const char **value_r)
{
    string_t *value = NULL;

    /* Domain     = sub-domain *("." sub-domain)
       sub-domain = Let-dig [Ldh-str]
       Let-dig    = ALPHA / DIGIT
       Ldh-str    = *( ALPHA / DIGIT / "-" ) Let-dig

       NOTE: A more generic syntax is accepted to be lenient towards
             systems that don't adhere to the standards. It allows
             '-' and '_' to occur anywhere in a sub-domain.
     */

    /* Let-dig (first) (nope) */
    if (parser->cur >= parser->end ||
        (!i_isalnum(*parser->cur) && *parser->cur != '-' &&
            *parser->cur != '_'))
        return 0;

    if (value_r != NULL)
        value = t_str_new(256);

    for (;;) {
        /* Let-dig (nope) */
        if (parser->cur >= parser->end || *parser->cur == '.') {
            parser->error = "Empty sub-domain";
            return -1;
        }
        if (!i_isalnum(*parser->cur) && *parser->cur != '-' &&
            *parser->cur != '_') {
            parser->error = "Invalid character in domain";
            return -1;
        }
        if (value_r != NULL)
            str_append_c(value, *parser->cur);
        parser->cur++;

        /* Ldh-str (nope) */
        while (parser->cur < parser->end) {
            if (!i_isalnum(*parser->cur) && *parser->cur != '-' &&
                *parser->cur != '_')
                break;

            if (value_r != NULL)
                str_append_c(value, *parser->cur);
            parser->cur++;
        }

        /* *("." sub-domain) */
        if (parser->cur >= parser->end || *parser->cur != '.')
            break;

        if (value_r != NULL)
            str_append_c(value, '.');
        parser->cur++;
    }

    if (value_r != NULL)
        *value_r = str_c(value);
    return 1;
}

static int
smtp_parser_parse_snum(struct smtp_parser *parser, string_t *literal,
               uint8_t *octet_r)
{
    const unsigned char *pbegin = parser->cur;
    uint8_t octet = 0;

    /* Snum                    = 1*3DIGIT
                               ; representing a decimal integer
                               ; value in the range 0 through 255
     */

    if (*parser->cur < '0' || *parser->cur > '9')
        return 0;
    do {
        if (octet >= ((uint8_t)-1 / 10)) {
            if (octet > (uint8_t)-1 / 10)
                return -1;
            if ((uint8_t)(*parser->cur - '0') > ((uint8_t)-1 % 10))
                return -1;
        }
        octet = octet * 10 + (*parser->cur - '0');
        parser->cur++;
    } while (*parser->cur >= '0' && *parser->cur <= '9');

    if (literal != NULL)
        str_append_n(literal, pbegin, parser->cur - pbegin);
    *octet_r = octet;
    return 1;
}

static int
smtp_parser_parse_ipv4_address(struct smtp_parser *parser,
                   string_t *literal, struct in_addr *ip4_r)
{
    uint8_t octet;
    uint32_t ip = 0;
    int ret;
    int i;

    /* IPv4-address-literal    = Snum 3("."  Snum) */
    if ((ret = smtp_parser_parse_snum(parser, literal, &octet)) <= 0)
        return ret;
    ip = octet;

    for (i = 0; i < 3 && parser->cur < parser->end; i++) {
        if (*parser->cur != '.')
            return -1;

        if (literal != NULL)
            str_append_c(literal, '.');
        parser->cur++;

        if ((ret = smtp_parser_parse_snum(parser,
            literal, &octet)) <= 0)
            return -1;
        ip = (ip << 8) + octet;
    }

    if (ip4_r != NULL)
        ip4_r->s_addr = htonl(ip);
    return 1;
}

int smtp_parser_parse_address_literal(struct smtp_parser *parser,
    const char **value_r, struct ip_addr *ip_r)
{
    const unsigned char *pblock;
    struct in_addr ip4;
    struct in6_addr ip6;
    bool ipv6 = FALSE;
    string_t *value = NULL, *tagbuf;
    int ret;

    /* address-literal         = "[" ( IPv4-address-literal /
                                IPv6-address-literal /
                                General-address-literal ) "]"
                               ; See Section 4.1.3

       IPv6-address-literal    = "IPv6:" IPv6-addr
       General-address-literal = Standardized-tag ":" 1*dcontent
       Standardized-tag        = Ldh-str
                               ; Standardized-tag MUST be specified in a
                               ; Standards-Track RFC and registered with
                               ; IANA
       dcontent                = %d33-90 / ; Printable US-ASCII
                                 %d94-126 ; excl. "[", "\", "]"
     */

    /* "[" */
    if (parser->cur >= parser->end || *parser->cur != '[')
        return 0;
    parser->cur++;

    if (value_r != NULL) {
        value = t_str_new(128);
        str_append_c(value, '[');
    }
    if (ip_r != NULL)
        i_zero(ip_r);

    /* IPv4-address-literal / ... */
    i_zero(&ip4);
    if ((ret=smtp_parser_parse_ipv4_address(parser, value, &ip4)) != 0) {
        if (ret < 0) {
            parser->error = "Invalid IPv4 address literal";
            return -1;
        }
        if (ip_r != NULL) {
            ip_r->family = AF_INET;
            ip_r->u.ip4 = ip4;
        }

    /* ... / IPv6-address-literal / General-address-literal */
    } else {
        /* IPv6-address-literal    = "IPv6:" IPv6-addr
           General-address-literal = Standardized-tag ":" 1*dcontent
           Standardized-tag        = Ldh-str
         */
        if (value_r != NULL) {
            tagbuf = value;
        } else {
            tagbuf = t_str_new(16);
            str_append_c(tagbuf, '[');
        }
        if ((ret=smtp_parser_parse_ldh_str(parser, tagbuf)) <= 0 ||
            parser->cur >= parser->end || *parser->cur != ':') {
            parser->error = "Invalid address literal";
            return -1;
        }
        if (strcasecmp(str_c(tagbuf)+1, "IPv6") == 0)
            ipv6 = TRUE;
        else if (value_r == NULL) {
            parser->error = t_strdup_printf(
                "Unsupported %s address literal",
                str_c(tagbuf)+1);
            return -1;
        }
        parser->cur++;
        if (value_r != NULL)
            str_append_c(value, ':');

        /* 1*dcontent */
        pblock = parser->cur;
        while (parser->cur < parser->end &&
            smtp_char_is_dcontent(*parser->cur))
            parser->cur++;

        if (parser->cur == pblock) {
            parser->error = "Empty address literal";
            return -1;
        }
        if (value_r != NULL)
            str_append_n(value, pblock, parser->cur - pblock);

        if (ipv6) {
            i_zero(&ip6);
            if ((ret = inet_pton(AF_INET6, t_strndup(pblock,
                parser->cur - pblock), &ip6)) <= 0) {
                parser->error = "Invalid IPv6 address literal";
                return -1;
            }
            if (ip_r != NULL) {
                ip_r->family = AF_INET6;
                ip_r->u.ip6 = ip6;
            }
        }
    }

    /* ']' */
    if (parser->cur >= parser->end) {
        parser->error = "Missing ']' at end of address literal";
        return -1;
    } else if (*parser->cur != ']') {
        parser->error = "Invalid character in address literal";
        return -1;
    }

    parser->cur++;
    if (value_r != NULL) {
        str_append_c(value, ']');
        *value_r = str_c(value);
    }
    return 1;
}

int smtp_parser_parse_quoted_string(struct smtp_parser *parser,
    const char **value_r)
{
    string_t *value = NULL;
    const unsigned char *pbegin;

    /* Quoted-string    = DQUOTE *QcontentSMTP DQUOTE
       QcontentSMTP     = qtextSMTP / quoted-pairSMTP
       quoted-pairSMTP  = %d92 %d32-126
                        ; i.e., backslash followed by any ASCII
                        ; graphic (including itself) or SPace
       qtextSMTP        = %d32-33 / %d35-91 / %d93-126
                        ; i.e., within a quoted string, any
                        ; ASCII graphic or space is permitted
                        ; without blackslash-quoting except
                        ; double-quote and the backslash itself.
     */

    /* DQUOTE */
    if (parser->cur >= parser->end || *parser->cur != '"')
        return 0;
    parser->cur++;

    if (value_r != NULL)
        value = t_str_new(256);

    /* *QcontentSMTP */
    while (parser->cur < parser->end) {
        pbegin = parser->cur;
        while (parser->cur < parser->end &&
            smtp_char_is_qtext(*parser->cur)) {
            /* qtextSMTP */
            parser->cur++;
        }

        if (value_r != NULL)
            str_append_n(value, pbegin, parser->cur - pbegin);

        if (parser->cur >= parser->end || *parser->cur != '\\')
            break;
        parser->cur++;

        /* quoted-pairSMTP */
        if (parser->cur >= parser->end ||
            !smtp_char_is_qpair(*parser->cur)) {
            parser->error =
                "Invalid character after '\\' in quoted string";
            return -1;
        }

        if (value_r != NULL)
            str_append_c(value, *parser->cur);
        parser->cur++;
    }

    /* DQUOTE */
    if (parser->cur >= parser->end)  {
        parser->error = "Premature end of quoted string";
        return -1;
    }
    if (*parser->cur != '"') {
        parser->error = "Invalid character in quoted string";
        return -1;
    }
    parser->cur++;
    if (value_r != NULL)
        *value_r = str_c(value);
    return 1;
}

static int
smtp_parser_skip_atom(struct smtp_parser *parser)
{
    /* Atom = 1*atext */

    if (parser->cur >= parser->end || !smtp_char_is_atext(*parser->cur))
        return 0;
    parser->cur++;

    while (parser->cur < parser->end && smtp_char_is_atext(*parser->cur))
        parser->cur++;
    return 1;
}

int smtp_parser_parse_atom(struct smtp_parser *parser,
    const char **value_r)
{
    const unsigned char *pbegin = parser->cur;
    int ret;

    if ((ret=smtp_parser_skip_atom(parser)) <= 0)
        return ret;

    if (value_r != NULL)
        *value_r = t_strndup(pbegin, parser->cur - pbegin);
    return 1;
}

int smtp_parser_parse_string(struct smtp_parser *parser,
    const char **value_r)
{
    int ret;

    /* String = Atom / Quoted-string */

    if ((ret=smtp_parser_parse_quoted_string(parser, value_r)) != 0)
        return ret;
    return smtp_parser_parse_atom(parser, value_r);
}

static bool
smtp_parse_xtext_hexdigit(const unsigned char digit,
    unsigned char *hexvalue)
{
    switch (digit) {
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
        *hexvalue = (*hexvalue) << 4;
        *hexvalue += digit - '0';
        break;
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        *hexvalue = (*hexvalue) << 4;
        *hexvalue += digit - 'A' + 10;
        break;
    default:
        return FALSE;
    }
    return TRUE;
}

int smtp_parser_parse_xtext(struct smtp_parser *parser,
    string_t *out)
{
    unsigned char hexchar;

    /* xtext   = *( xchar / hexchar )
       xchar   = any ASCII CHAR between "!" (33) and "~" (126) inclusive,
                  except for "+" and "=".
       hexchar = ASCII "+" immediately followed by two upper case
                 hexadecimal digits
     */
    if (parser->cur >= parser->end ||
        (!smtp_char_is_xtext(*parser->cur) && *parser->cur != '+'))
        return 0;

    while (parser->cur < parser->end) {
        const unsigned char *pbegin = parser->cur;

        while (parser->cur < parser->end &&
            smtp_char_is_xtext(*parser->cur))
            parser->cur++;

        if (out != NULL)
            str_append_n(out, pbegin, parser->cur - pbegin);

        if (parser->cur >= parser->end || *parser->cur != '+')
            break;
        parser->cur++;

        hexchar = 0;
        if (smtp_parse_xtext_hexdigit(*parser->cur, &hexchar)) {
            parser->cur++;
            if (smtp_parse_xtext_hexdigit(*parser->cur, &hexchar)) {
                parser->cur++;
                if (out != NULL)
                    str_append_c(out, hexchar);
                continue;
            }
        }

        parser->error = "Invalid hexchar after '+' in xtext";
        return -1;
    }

    return 1;
}