src/lib-http/http-url.c

/* Copyright (c) 2013-2018 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "str.h"
#include "strfuncs.h"
#include "net.h"
#include "uri-util.h"

#include "http-url.h"
#include "http-request.h"

/*
 * HTTP URL parser
 */

/* Parsing state shared by the HTTP URL parsing functions in this file. */
struct http_url_parser {
    /* Generic URI parser state (input cursor, pool and error message) */
    struct uri_parser parser;

    /* HTTP_URL_* flags that control which URL parts are accepted */
    enum http_url_parse_flags flags;

    /* URL structure being filled in. The parse functions skip all result
       assignments when this is NULL; it is reset to NULL when a request
       target turns out to be an absolute URI with a non-HTTP scheme. */
    struct http_url *url;
    /* Base URL against which relative URLs are resolved; relative URLs are
       rejected when this is NULL */
    struct http_url *base;

    /* Request target format determined while parsing */
    enum http_request_target_format req_format;

    /* Set when the parsed URL was relative (resolved against the base) */
    bool relative:1;
    /* Set by the caller when the input is an HTTP request target */
    bool request_target:1;
};

/* Parse the authority component ([userinfo "@"] host [":" port]) at the
   current parser position.

   An empty host is always rejected; a userinfo part is rejected unless the
   HTTP_URL_ALLOW_USERINFO_PART flag is set. On success, host, port, user and
   password are stored in url_parser->url when that is non-NULL.

   Returns TRUE on success and FALSE on failure.
 */
static bool http_url_parse_authority(struct http_url_parser *url_parser)
{
    struct uri_parser *parser = &url_parser->parser;
    struct http_url *url = url_parser->url;
    struct uri_authority auth;
    const char *user = NULL, *password = NULL;
    int ret;

    ret = uri_parse_host_authority(parser, &auth);
    if (ret < 0)
        return FALSE;

    if (auth.host.name == NULL || auth.host.name[0] == '\0') {
        /* RFC 7230, Section 2.7.1: http URI Scheme

           A sender MUST NOT generate an "http" URI with an empty host
           identifier.  A recipient that processes such a URI reference MUST
           reject it as invalid.
         */
        parser->error = "HTTP URL does not allow empty host identifier";
        return FALSE;
    }

    if (ret > 0 && auth.enc_userinfo != NULL) {
        const char *colon;

        if ((url_parser->flags & HTTP_URL_ALLOW_USERINFO_PART) == 0) {
            /* RFC 7230, Section 2.7.1: http URI Scheme

               A sender MUST NOT generate the userinfo subcomponent (and its "@"
               delimiter) when an "http" URI reference is generated within a
               message as a request target or header field value.  Before making
               use of an "http" URI reference received from an untrusted source,
               a recipient SHOULD parse for userinfo and treat its presence as
               an error; it is likely being used to obscure the authority for
               the sake of phishing attacks.
             */
            parser->error = "HTTP URL does not allow `userinfo@' part";
            return FALSE;
        }

        /* userinfo is "user" or "user:password"; the user part runs up to
           the first colon, or to the end of the string (colon == NULL) when
           there is none */
        colon = strchr(auth.enc_userinfo, ':');
        if (!uri_data_decode(parser, auth.enc_userinfo, colon, &user))
            return FALSE;
        if (colon != NULL &&
            !uri_data_decode(parser, colon + 1, NULL, &password))
            return FALSE;
    }

    if (url == NULL)
        return TRUE;

    /* Caller wants the result: copy everything into the parser's pool */
    uri_host_copy(parser->pool, &url->host, &auth.host);
    url->port = auth.port;
    url->user = p_strdup(parser->pool, user);
    url->password = p_strdup(parser->pool, password);
    return TRUE;
}

/* Parse the whole remaining input as an authority-form request target.

   Succeeds only when the authority parses correctly and consumes all input;
   in that case the request format is recorded as
   HTTP_REQUEST_TARGET_FORMAT_AUTHORITY. Returns TRUE on success and FALSE
   otherwise.
 */
static bool http_url_parse_authority_form(struct http_url_parser *url_parser)
{
    /* Anything left after the authority means this is not authority-form */
    if (!http_url_parse_authority(url_parser) ||
        url_parser->parser.cur != url_parser->parser.end)
        return FALSE;

    url_parser->req_format = HTTP_REQUEST_TARGET_FORMAT_AUTHORITY;
    return TRUE;
}

/* Parse the input of url_parser->parser as an HTTP URL, or as an HTTP request
   target when url_parser->request_target is set.

   Relative URLs are resolved against url_parser->base and are rejected when
   no base is available. Parsed components are stored in url_parser->url when
   it is non-NULL; url_parser->relative and url_parser->req_format are updated
   along the way. When a request target is an absolute URI with a non-HTTP
   scheme that is not valid as authority-form either, url_parser->url is reset
   to NULL and TRUE is returned.

   Returns TRUE on success and FALSE on failure.
 */
static bool http_url_do_parse(struct http_url_parser *url_parser)
{
    struct uri_parser *parser = &url_parser->parser;
    struct http_url *url = url_parser->url, *base = url_parser->base;
    const char *const *path;
    bool relative = TRUE, have_scheme = FALSE, have_authority = FALSE,
        have_path = FALSE;
    int path_relative;
    const char *part;
    int ret;

    /* RFC 7230, Appendix B:

       http-URI       = "http://" authority path-abempty [ "?" query ]
                        [ "#" fragment ]
       https-URI      = "https://" authority path-abempty [ "?" query ]
                        [ "#" fragment ]
       partial-URI    = relative-part [ "?" query ]

       request-target = origin-form / absolute-form / authority-form /
                        asterisk-form

       origin-form    = absolute-path [ "?" query ]
       absolute-form  = absolute-URI
       authority-form = authority
       asterisk-form  = "*"
                      ; Not parsed here

       absolute-path  = 1*( "/" segment )

       RFC 3986, Appendix A: (implemented in uri-util.h)

       absolute-URI   = scheme ":" hier-part [ "?" query ]

       hier-part      = "//" authority path-abempty
                      / path-absolute
                      / path-rootless
                      / path-empty

       relative-part  = "//" authority path-abempty
                      / path-absolute
                      / path-noscheme
                      / path-empty

       authority     = [ userinfo "@" ] host [ ":" port ]

       path-abempty   = *( "/" segment )
       path-absolute  = "/" [ segment-nz *( "/" segment ) ]
       path-noscheme  = segment-nz-nc *( "/" segment )
       path-rootless  = segment-nz *( "/" segment )
       path-empty     = 0<pchar>

       segment        = *pchar
       segment-nz     = 1*pchar
       segment-nz-nc  = 1*( unreserved / pct-encoded / sub-delims / "@" )
                    ; non-zero-length segment without any colon ":"

       query          = *( pchar / "/" / "?" )
       fragment       = *( pchar / "/" / "?" )
     */

    /* "http:" / "https:" */
    if ((url_parser->flags & HTTP_URL_PARSE_SCHEME_EXTERNAL) == 0) {
        const char *scheme;

        if ((ret = uri_parse_scheme(parser, &scheme)) <= 0) {
            /* no scheme: rewind and continue as a relative reference */
            parser->cur = parser->begin;
        } else {
            if (strcasecmp(scheme, "https") == 0) {
                if (url != NULL)
                    url->have_ssl = TRUE;
            } else if (strcasecmp(scheme, "http") != 0) {
                if (url_parser->request_target) {
                    /* valid as non-HTTP scheme, but also try to parse as authority */
                    parser->cur = parser->begin;
                    if (!http_url_parse_authority_form(url_parser)) {
                        url_parser->url = NULL; /* indicate non-http-url */
                        url_parser->req_format = HTTP_REQUEST_TARGET_FORMAT_ABSOLUTE;
                    }
                    return TRUE;
                }
                parser->error = "Not an HTTP URL";
                return FALSE;
            }
            relative = FALSE;
            have_scheme = TRUE;
        }
    } else {
        /* the scheme was handled by the caller */
        relative = FALSE;
        have_scheme = TRUE;
    }

    /* "//" authority   ; or
     * ["//"] authority ; when parsing a request target
     */
    if (parser->cur < parser->end && parser->cur[0] == '/') {
        if (parser->cur+1 < parser->end && parser->cur[1] == '/') {
            parser->cur += 2;
            relative = FALSE;
            have_authority = TRUE;
        } else {
            /* start of absolute-path */
        }
    } else if (url_parser->request_target && !have_scheme) {
        if (!http_url_parse_authority_form(url_parser)) {
            /* not non-HTTP scheme and invalid as authority-form */
            parser->error = "Request target is invalid";
            return FALSE;
        }
        return TRUE;
    }

    if (have_scheme && !have_authority) {
        parser->error = "Absolute HTTP URL requires `//' after `http:'";
        return FALSE;
    }

    if (have_authority) {
        if (!http_url_parse_authority(url_parser))
            return FALSE;
    }

    /* path-abempty / path-absolute / path-noscheme / path-empty */
    if ((ret = uri_parse_path(parser, &path_relative, &path)) < 0)
        return FALSE;

    /* Relative URLs are only valid when we have a base URL */
    if (relative) {
        if (base == NULL) {
            parser->error = "Relative HTTP URL not allowed";
            return FALSE;
        } else if (!have_authority && url != NULL) {
            /* inherit the authority from the base URL */
            uri_host_copy(parser->pool, &url->host, &base->host);
            url->port = base->port;
            url->have_ssl = base->have_ssl;
            url->user = p_strdup_empty(parser->pool, base->user);
            url->password = p_strdup_empty(parser->pool, base->password);
        }

        url_parser->relative = TRUE;
    }

    /* Resolve path */
    if (ret > 0) {
        string_t *fullpath = NULL;

        have_path = TRUE;

        if (url != NULL)
            fullpath = t_str_new(256);

        if (relative && path_relative > 0 && base->path != NULL) {
            const char *pbegin = base->path;
            const char *pend = base->path + strlen(base->path);

            i_assert(*pbegin == '/');

            /* discard trailing segments of base path based on how many effective
               leading '..' segments were found in the relative path.

               Each step cuts the base path at its last '/'. The loop keeps
               going for as long as anything is left, so a base path of just
               "/" is reduced to the empty string too; otherwise its slash
               would be duplicated by the one prepended to the first segment
               appended below (yielding "//foo" rather than "/foo").
             */
            while (path_relative > 0 && pend > pbegin) {
                const char *p = pend - 1;

                while (p > pbegin && *p != '/')
                    p--;
                pend = p;
                path_relative--;
            }

            if (url != NULL && pend > pbegin)
                str_append_n(fullpath, pbegin, pend-pbegin);
        }

        /* append relative path */
        while (*path != NULL) {
            if (!uri_data_decode(parser, *path, NULL, &part))
                return FALSE;

            if (url != NULL) {
                str_append_c(fullpath, '/');
                str_append(fullpath, part);
            }
            path++;
        }

        if (url != NULL)
            url->path = p_strdup(parser->pool, str_c(fullpath));
    } else if (relative && url != NULL) {
        /* no path given: inherit the base path unchanged */
        url->path = p_strdup(parser->pool, base->path);
    }

    /* [ "?" query ] */
    if ((ret = uri_parse_query(parser, &part)) < 0)
        return FALSE;
    if (ret > 0) {
        if (url != NULL)
            url->enc_query = p_strdup(parser->pool, part);
    } else if (relative && !have_path && url != NULL) {
        /* the base query is only inherited when no path was given */
        url->enc_query = p_strdup(parser->pool, base->enc_query);
    }

    /* [ "#" fragment ] */
    if ((ret = uri_parse_fragment(parser, &part)) < 0)
        return FALSE;
    if (ret > 0) {
        if ((url_parser->flags & HTTP_URL_ALLOW_FRAGMENT_PART) == 0) {
            parser->error = "URL fragment not allowed for HTTP URL in this context";
            return FALSE;
        }
        if (url != NULL)
            url->enc_fragment =  p_strdup(parser->pool, part);
    } else if (relative && !have_path && url != NULL) {
        /* the base fragment is only inherited when no path was given */
        url->enc_fragment = p_strdup(parser->pool, base->enc_fragment);
    }

    /* must be at end of URL now */
    i_assert(parser->cur == parser->end);

    if (have_scheme)
        url_parser->req_format = HTTP_REQUEST_TARGET_FORMAT_ABSOLUTE;
    return TRUE;
}

/* Public API */

/* Parse an HTTP URL string.

   url:     the URL text to parse
   base:    base URL for resolving relative URLs, or NULL to reject them;
            must be NULL when HTTP_URL_PARSE_SCHEME_EXTERNAL is set
   flags:   HTTP_URL_* parse flags
   pool:    pool from which the resulting URL structure is allocated
   url_r:   receives the parsed URL on success
   error_r: receives the parser's error message on failure

   Returns 0 on success and -1 on failure.
 */
int http_url_parse(const char *url, struct http_url *base,
           enum http_url_parse_flags flags, pool_t pool,
           struct http_url **url_r, const char **error_r)
{
    struct http_url_parser ctx;

    /* base != NULL indicates whether relative URLs are allowed. However, certain
       flags may also dictate whether relative URLs are allowed/required. */
    i_assert((flags & HTTP_URL_PARSE_SCHEME_EXTERNAL) == 0 || base == NULL);

    i_zero(&ctx);
    uri_parser_init(&ctx.parser, pool, url);
    ctx.parser.allow_pct_nul = (flags & HTTP_URL_ALLOW_PCT_NUL) != 0;

    ctx.flags = flags;
    ctx.base = base;
    ctx.url = p_new(pool, struct http_url, 1);

    if (http_url_do_parse(&ctx)) {
        *url_r = ctx.url;
        return 0;
    }

    *error_r = ctx.parser.error;
    return -1;
}

/* Parse an HTTP request target together with the Host header value.

   request_target: the request target text from the request line
   host_header:    the Host header value; it must be a plain authority
                   without userinfo and without trailing data
   pool:           pool from which the resulting URL is allocated
   target:         receives the parsed URL and the detected target format;
                   the URL is NULL when the do-parse step flagged the target
                   as a non-HTTP absolute URI
   error_r:        receives an error message on failure

   Returns 0 on success and -1 on failure.
 */
int http_url_request_target_parse(const char *request_target,
    const char *host_header, pool_t pool, struct http_request_target *target,
    const char **error_r)
{
    struct http_url_parser url_parser;
    struct uri_parser *parser = &url_parser.parser;
    struct uri_authority host_auth;
    struct http_url host_base;

    /* First pass: validate and parse the Host header */
    i_zero(&url_parser);
    uri_parser_init(parser, pool, host_header);

    /* NOTE(review): a zero return is treated as failure here and
       parser->error is then formatted with %s; confirm that the helper
       always sets an error message in that case. */
    if (uri_parse_host_authority(parser, &host_auth) <= 0) {
        *error_r = t_strdup_printf("Invalid Host header: %s", parser->error);
        return -1;
    }

    if (host_auth.enc_userinfo != NULL || parser->cur != parser->end) {
        *error_r = "Invalid Host header: Contains invalid character";
        return -1;
    }

    /* asterisk-form: the URL consists of just the Host header authority */
    if (strcmp(request_target, "*") == 0) {
        struct http_url *asterisk_url = p_new(pool, struct http_url, 1);

        uri_host_copy(pool, &asterisk_url->host, &host_auth.host);
        asterisk_url->port = host_auth.port;

        target->url = asterisk_url;
        target->format = HTTP_REQUEST_TARGET_FORMAT_ASTERISK;
        return 0;
    }

    /* The Host header acts as the base URL for the request target */
    i_zero(&host_base);
    host_base.host = host_auth.host;
    host_base.port = host_auth.port;

    /* Second pass: restart the parser on the request target itself */
    i_zero(parser);
    uri_parser_init(parser, pool, request_target);

    url_parser.flags = 0;
    url_parser.base = &host_base;
    url_parser.request_target = TRUE;
    /* origin-form unless the parser determines otherwise */
    url_parser.req_format = HTTP_REQUEST_TARGET_FORMAT_ORIGIN;
    url_parser.url = p_new(pool, struct http_url, 1);

    if (!http_url_do_parse(&url_parser)) {
        *error_r = url_parser.parser.error;
        return -1;
    }

    target->format = url_parser.req_format;
    target->url = url_parser.url;
    return 0;
}

/*
 * HTTP URL manipulation
 */

/* Reset dest and copy only the host, port and SSL flag from src. The host is
   copied with uri_host_copy() using the given pool. */
void http_url_copy_authority(pool_t pool, struct http_url *dest,
    const struct http_url *src)
{
    /* Clear everything first so that all other fields end up zeroed */
    i_zero(dest);

    dest->have_ssl = src->have_ssl;
    dest->port = src->port;
    uri_host_copy(pool, &dest->host, &src->host);
}

/* Allocate a new URL from pool and fill it with http_url_copy_authority(). */
struct http_url *http_url_clone_authority(pool_t pool,
    const struct http_url *src)
{
    struct http_url *clone = p_new(pool, struct http_url, 1);

    http_url_copy_authority(pool, clone, src);
    return clone;
}

/* Copy src into dest: the authority via http_url_copy_authority(), plus the
   path, encoded query and encoded fragment, each duplicated into pool. The
   user and password fields are not copied here. */
void http_url_copy(pool_t pool, struct http_url *dest,
    const struct http_url *src)
{
    /* This resets dest, so it must come before the other assignments */
    http_url_copy_authority(pool, dest, src);

    dest->path = p_strdup(pool, src->path);
    dest->enc_query = p_strdup(pool, src->enc_query);
    dest->enc_fragment = p_strdup(pool, src->enc_fragment);
}

/* Same as http_url_copy(), but the user and password fields are duplicated
   into pool as well. */
void http_url_copy_with_userinfo(pool_t pool, struct http_url *dest,
    const struct http_url *src)
{
    /* The plain copy resets dest, so the userinfo is filled in afterwards */
    http_url_copy(pool, dest, src);

    dest->user = p_strdup(pool, src->user);
    dest->password = p_strdup(pool, src->password);
}

/* Allocate a new URL from pool and fill it with http_url_copy(). */
struct http_url *http_url_clone(pool_t pool, const struct http_url *src)
{
    struct http_url *clone = p_new(pool, struct http_url, 1);

    http_url_copy(pool, clone, src);
    return clone;
}

/* Allocate a new URL from pool and fill it with
   http_url_copy_with_userinfo(). */
struct http_url *http_url_clone_with_userinfo(pool_t pool,
    const struct http_url *src)
{
    struct http_url *clone = p_new(pool, struct http_url, 1);

    http_url_copy_with_userinfo(pool, clone, src);
    return clone;
}


/*
 * HTTP URL construction
 */

/* Append the scheme ("https" when url->have_ssl is set, "http" otherwise)
   followed by "//" to urlstr. */
static void
http_url_add_scheme(string_t *urlstr, const struct http_url *url)
{
    const char *scheme = url->have_ssl ? "https" : "http";

    uri_append_scheme(urlstr, scheme);
    str_append(urlstr, "//");
}

/* Append the URL's host and then its port to urlstr, using the generic URI
   append helpers. */
static void
http_url_add_authority(string_t *urlstr, const struct http_url *url)
{
    uri_append_host(urlstr, &url->host);
    uri_append_port(urlstr, url->port);
}

/* Append the path and the optional "?query" part of url to urlstr. A missing
   or empty path is written as a single "/". */
static void
http_url_add_target(string_t *urlstr, const struct http_url *url)
{
    bool have_path = (url->path != NULL && url->path[0] != '\0');

    if (have_path) {
        uri_append_path_data(urlstr, "", url->path);
    } else {
        /* Older syntax of RFC 2616 requires this slash at all times for an
           absolute URL
         */
        str_append_c(urlstr, '/');
    }

    /* The query is stored pre-encoded, so it is appended verbatim */
    if (url->enc_query == NULL)
        return;
    str_append_c(urlstr, '?');
    str_append(urlstr, url->enc_query);
}

/* Build the full URL string: scheme, authority, target (path and query) and,
   when present, "#fragment". The result is the content of a string created
   with t_str_new(). */
const char *http_url_create(const struct http_url *url)
{
    string_t *out = t_str_new(512);

    http_url_add_scheme(out, url);
    http_url_add_authority(out, url);
    http_url_add_target(out, url);

    /* The fragment is stored pre-encoded; without one we are done */
    if (url->enc_fragment == NULL)
        return str_c(out);

    str_append_c(out, '#');
    str_append(out, url->enc_fragment);
    return str_c(out);
}

/* Build a string consisting of the URL's scheme and authority only. The
   result is the content of a string created with t_str_new(). */
const char *http_url_create_host(const struct http_url *url)
{
    string_t *out = t_str_new(512);

    http_url_add_scheme(out, url);
    http_url_add_authority(out, url);
    return str_c(out);
}

/* Build a string consisting of the URL's authority (host and port) only. The
   result is the content of a string created with t_str_new(). */
const char *http_url_create_authority(const struct http_url *url)
{
    string_t *out = t_str_new(256);

    http_url_add_authority(out, url);
    return str_c(out);
}

/* Build a string consisting of the URL's target (path plus optional query)
   only. The result is the content of a string created with t_str_new(). */
const char *http_url_create_target(const struct http_url *url)
{
    string_t *out = t_str_new(256);

    http_url_add_target(out, url);
    return str_c(out);
}

/* Append data to out through uri_append_query_data(), passing "&;?=+" as its
   character-set argument (presumably the extra characters to escape; confirm
   against uri-util.h). */
void http_url_escape_path(string_t *out, const char *data)
{
    static const char path_chars[] = "&;?=+";

    uri_append_query_data(out, path_chars, data);
}

/* Append data to out through uri_append_query_data(), passing "&;/?=+" as its
   character-set argument (presumably the extra characters to escape; confirm
   against uri-util.h). Unlike http_url_escape_path(), the set includes '/'. */
void http_url_escape_param(string_t *out, const char *data)
{
    static const char param_chars[] = "&;/?=+";

    uri_append_query_data(out, param_chars, data);
}