// src/dom/uri.cpp
//
// uri.cpp revision 92eb663500564c06258bc74260952f9fe89258d1
/*
 * Phoebe DOM Implementation.
 *
 * This is a C++ approximation of the W3C DOM model, which follows
 * fairly closely the specifications in the various .idl files, copies of
 * which are provided for reference.  Most important is this one:
 *
 * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
 *
 * Authors:
 *   Bob Jamison
 *
 * Copyright (C) 2005-2008 Bob Jamison
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include "uri.h"
#include "ucd.h"

#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <vector>

namespace org
{
namespace w3c
{
namespace dom
{


/**
 * One row of the scheme lookup table: a scheme identifier, the text that
 * introduces the scheme in a URI, and a port number stored with it.
 */
struct LookupEntry
{
    int         ival;  // scheme identifier
    const char *sval;  // scheme text to match; NULL in the terminating row
    int         port;  // port number copied into the URI when the row matches
};

/**
 * Table of the URI schemes recognized by URI::parseScheme(), which scans it
 * in order and stops at the first entry whose text matches.
 *
 * Each row holds the scheme identifier, the exact text to match (scheme name
 * plus the trailing ':') and the scheme's default port, or 0 where the
 * scheme has no port.  The row with a NULL string terminates the table.
 */
static LookupEntry schemes[] =
{
    { URI::SCHEME_DATA,   "data:",    0 },
    { URI::SCHEME_HTTP,   "http:",   80 },
    { URI::SCHEME_HTTPS,  "https:", 443 },
    { URI::SCHEME_FTP,    "ftp:",    21 },  // was "ftp" (no ':') with port 12
    { URI::SCHEME_FILE,   "file:",    0 },
    { URI::SCHEME_LDAP,   "ldap:",  389 },  // was 123 (that is NTP)
    { URI::SCHEME_MAILTO, "mailto:", 25 },
    { URI::SCHEME_NEWS,   "news:",  119 },  // NNTP; was 117
    { URI::SCHEME_TELNET, "telnet:", 23 },
    { 0,                  NULL,       0 }
};


//#########################################################################
//# C O N S T R U C T O R
//#########################################################################

/**
 * Default constructor.  Leaves the URI empty: every field is reset
 * by init().
 */
URI::URI()
{
    this->init();
}

/**
 * Construct a URI by parsing a DOMString.  The fields are first reset
 * with init(); the result of parse() is not checked here.
 */
URI::URI(const DOMString &str)
{
    this->init();
    this->parse(str);
}


/**
 * Construct a URI by parsing a C string.  The fields are first reset with
 * init(); the result of parse() is not checked here.
 *
 * A NULL pointer yields an empty URI instead of being converted to a
 * DOMString, which would be undefined behaviour.
 */
URI::URI(const char *str)
{
    init();
    if (!str)
        return;
    DOMString domStr = str;
    parse(domStr);
}


/**
 * Copy constructor.  Resets this object with init(), then copies the
 * fields of 'other' through assign().
 */
URI::URI(const URI &other)
{
    this->init();
    this->assign(other);
}


/**
 * Assignment.  Resets this object with init(), then copies the fields of
 * 'other' through assign().
 *
 * Self-assignment is a no-op: without the guard, init() would clear the
 * object before assign() read from it, leaving the URI empty.
 */
URI &URI::operator=(const URI &other)
{
    if (this != &other)
        {
        init();
        assign(other);
        }
    return *this;
}


/**
 * Destructor.  The body is empty; no explicit cleanup is performed here.
 */
URI::~URI()
{
}


/**
 * Reset every field to its empty state: no parse buffer, no scheme,
 * port 0 and not specified, empty authority/path/query/fragment, and
 * neither absolute nor opaque.
 */
void URI::init()
{
    //# parser scratch state
    parsebuf      = NULL;
    parselen      = 0;

    //# scheme
    scheme        = SCHEME_NONE;
    schemeStr.clear();

    //# authority and port
    authority.clear();
    port          = 0;
    portSpecified = false;

    //# path and its flags
    path.clear();
    absolute      = false;
    opaque        = false;

    //# trailing components
    query.clear();
    fragment.clear();
}


/**
 *
 */
void URI::assign(const URI &other)
{
    scheme    = other.scheme;
    schemeStr = other.schemeStr;
    authority = other.authority;
    port      = other.port;
    path      = other.path;
    absolute  = other.absolute;
    opaque    = other.opaque;
    query     = other.query;
    fragment  = other.fragment;
}


//#########################################################################
//# A T T R I B U T E S
//#########################################################################
/** Lower-case hexadecimal digits used when percent-escaping. */
static const char *hexChars = "0123456789abcdef";

/**
 * Convert a sequence of character codes to a DOMString.
 *
 * Printable ASCII codes (0x20..0x7e) are appended unchanged.  Every other
 * code is written as '%' followed by two hex digits taken from its low
 * eight bits, so codes above 0xff are not represented losslessly.
 *
 * The range test replaces isprint(), which is undefined for values that
 * are not representable as unsigned char and whose result depends on the
 * current locale.
 *
 * @param arr the character codes to convert
 * @return the escaped string
 */
static DOMString toStr(const std::vector<int> &arr)
{
    DOMString buf;
    for (std::vector<int>::const_iterator iter = arr.begin() ;
         iter != arr.end() ; ++iter)
        {
        const int ch = *iter;
        if (ch >= 0x20 && ch <= 0x7e)
            buf.push_back((XMLCh)ch);
        else
            {
            buf.push_back('%');
            buf.push_back(hexChars[(ch >> 4) & 0xf]);
            buf.push_back(hexChars[ ch       & 0xf]);
            }
        }
    return buf;
}


/**
 * Serialize the URI: scheme text, then "//" plus the authority when the
 * authority is not empty, then the path, then "?query" and "#fragment"
 * when those are not empty.
 *
 * A port that was explicitly specified is written after the authority as
 * ":port", so that it is not lost when the URI is turned back into text.
 */
DOMString URI::toString() const
{
    DOMString str = schemeStr;
    if (!authority.empty())
        {
        str.append("//");
        str.append(toStr(authority));
        if (portSpecified && port >= 0)
            {
            // ':' + at most 10 digits + NUL fits comfortably
            char portBuf[16];
            snprintf(portBuf, sizeof(portBuf), ":%d", port);
            str.append(portBuf);
            }
        }
    str.append(toStr(path));
    if (!query.empty())
        {
        str.append("?");
        str.append(toStr(query));
        }
    if (!fragment.empty())
        {
        str.append("#");
        str.append(toStr(fragment));
        }
    return str;
}


int URI::getScheme() const
{
    return scheme;
}

/** @return a copy of the stored scheme text. */
DOMString URI::getSchemeStr() const
{
    return this->schemeStr;
}


/**
 * Return the authority as text, followed by ":port" when a port was
 * explicitly specified and is not negative.
 *
 * The port is formatted with "%d" into a buffer large enough for any int.
 * The previous ":%6d" format, limited to 6 bytes, padded the number with
 * spaces and then truncated it (port 80 came out as ":    ").
 */
DOMString URI::getAuthority() const
{
    DOMString ret = toStr(authority);
    if (portSpecified && port>=0)
        {
        char buf[16];
        snprintf(buf, sizeof(buf), ":%d", port);
        ret.append(buf);
        }
    return ret;
}

/** @return the authority converted to text by toStr(), without the port. */
DOMString URI::getHost() const
{
    return toStr(authority);
}

int URI::getPort() const
{
    return port;
}


/** @return the path converted to text by toStr(). */
DOMString URI::getPath() const
{
    return toStr(path);
}

/**
 * Return the path in the form used by the host file system.
 *
 * On Windows builds, a leading '/' in front of a drive specification
 * ("/C:...") is dropped and every '/' becomes '\'.  On other builds the
 * path text is returned unchanged.
 *
 * Windows is detected through _WIN32 as well as __WIN32__, because not
 * every Windows compiler defines the latter.
 */
DOMString URI::getNativePath() const
{
    DOMString pathStr = toStr(path);
    DOMString npath;
#if defined(_WIN32) || defined(__WIN32__)
    //# Skip the '/' that precedes a drive letter
    unsigned int firstChar = 0;
    if (pathStr.size() >= 3      &&
        pathStr[0] == '/'        &&
        uni_is_letter(pathStr[1]) &&
        pathStr[2] == ':')
        firstChar++;
    for (unsigned int i=firstChar ; i<pathStr.size() ; i++)
        {
        XMLCh ch = (XMLCh) pathStr[i];
        if (ch == '/')
            npath.push_back((XMLCh)'\\');
        else
            npath.push_back(ch);
        }
#else
    npath = pathStr;
#endif
    return npath;
}


bool URI::isAbsolute() const
{
    return absolute;
}

bool URI::isOpaque() const
{
    return opaque;
}


/** @return the query converted to text by toStr(). */
DOMString URI::getQuery() const
{
    return toStr(query);
}


/** @return the fragment converted to text by toStr(). */
DOMString URI::getFragment() const
{
    return toStr(fragment);
}


/**
 * Find the first occurrence of 'ch' in 'str' at or after 'startpos'.
 *
 * @return the index of the match, or -1 when there is none
 */
static int find(const std::vector<int> &str, int ch, int startpos)
{
    unsigned int idx = startpos;
    while (idx < str.size())
        {
        if (str[idx] == ch)
            return idx;
        ++idx;
        }
    return -1;
}


/**
 * Find the last occurrence of 'ch' in 'str'.
 *
 * @return the index of the match, or -1 when there is none
 */
static int findLast(const std::vector<int> &str, int ch)
{
    // The index is deliberately signed: with an unsigned index the
    // ">= 0" test could never fail and the loop would not terminate.
    int idx = (int)str.size();
    while (--idx >= 0)
        {
        if (str[idx] == ch)
            return idx;
        }
    return -1;
}


/**
 * Test whether the character codes in 'str' spell exactly the C string
 * 'key': same length, same characters.
 *
 * The key pointer is advanced with every character and the key must be
 * fully consumed at the end.  Previously the pointer never moved, so any
 * string made only of the key's first character matched (".." compared
 * equal to "."), and an empty 'str' matched every key.
 *
 * @return true on an exact match; false otherwise or when 'key' is NULL
 */
static bool sequ(const std::vector<int> &str, const char *key)
{
    if (!key)
        return false;
    const char *c = key;
    for (unsigned int i=0 ; i<str.size() ; i++, c++)
        {
        if (*c == '\0')      // key is shorter than str
            return false;
        if (*c != str[i])    // mismatch
            return false;
        }
    return *c == '\0';       // false when key is longer than str
}


/**
 * Copy up to 'len' elements of 'str' starting at 'startpos'.  Copying
 * stops early at the end of 'str'; nothing is copied when 'len' is not
 * positive or the start lies beyond the end.
 */
static std::vector<int> substr(const std::vector<int> &str,
                      int startpos, int len)
{
    std::vector<int> piece;
    unsigned int cursor = startpos;
    int remaining = len;
    while (remaining > 0 && cursor < str.size())
        {
        piece.push_back(str[cursor]);
        ++cursor;
        --remaining;
        }
    return piece;
}


/**
 * Resolve the reference 'other' against this URI, which acts as the base.
 *
 *  1. If this URI is opaque, or 'other' is flagged absolute, 'other' is
 *     returned unchanged.
 *  2. If 'other' consists of a fragment only, the result is a copy of
 *     this URI with that fragment.
 *  3. Otherwise a new URI is built from this URI's scheme and the query
 *     and fragment of 'other':
 *       - if 'other' has an authority, its authority, port and path are
 *         used;
 *       - otherwise the authority and port are inherited from this URI
 *         and the path of 'other' is merged onto this URI's path up to
 *         and including its last '/'.  A base with an authority and an
 *         empty path merges as "/" + other path.
 *     The result is then normalized.
 *
 * NOTE(review): the 'absolute' flag is set by the parser from the path,
 * so a reference such as "/x" takes branch 1 and is returned without the
 * base scheme and authority -- confirm whether that is intended.
 *
 * @param other the reference to resolve
 * @return the resolved URI
 */
URI URI::resolve(const URI &other) const
{
    //### According to w3c, this is handled in 3 cases

    //## 1
    if (opaque || other.isAbsolute())
        return other;

    //## 2
    if (!other.fragment.empty()     &&
        other.path.empty()          &&
        other.scheme == SCHEME_NONE &&
        other.authority.empty()     &&
        other.query.empty())
        {
        URI fragUri = *this;
        fragUri.fragment = other.fragment;
        return fragUri;
        }

    //## 3 http://www.ietf.org/rfc/rfc2396.txt, section 5.2
    URI newUri;
    //# 3.1
    newUri.scheme    = scheme;
    newUri.schemeStr = schemeStr;
    newUri.query     = other.query;
    newUri.fragment  = other.fragment;
    if (!other.authority.empty())
        {
        //# 3.2
        if (absolute || other.absolute)
            newUri.absolute = true;
        newUri.authority     = other.authority;
        newUri.port          = other.port;//part of authority
        newUri.portSpecified = other.portSpecified;
        newUri.path          = other.path;
        }
    else
        {
        //# 3.3  The reference has no authority: inherit the base's
        newUri.authority     = authority;
        newUri.port          = port;
        newUri.portSpecified = portSpecified;
        if (other.absolute)
            {
            newUri.absolute = true;
            newUri.path     = other.path;
            }
        else
            {
            int pos = findLast(path, '/');
            if (pos >= 0)
                {
                //# my path up to and including the '/', then other path;
                //# the result starts like my path, so it keeps my flag
                newUri.absolute = absolute;
                newUri.path.assign(path.begin(), path.begin() + pos + 1);
                }
            else if (!authority.empty() && path.empty())
                {
                //# RFC 3986 5.2.3: authority with empty path merges
                //# as "/" + reference path
                newUri.absolute = true;
                newUri.path.push_back('/');
                }
            newUri.path.insert(newUri.path.end(),
                               other.path.begin(), other.path.end());
            }
        }

    newUri.normalize();

    return newUri;
}


/**
 *  Normalize the path in place, modelled on the Java URI algorithm:
 *   1. Segments that sequ() reports equal to "." are removed.
 *   2. A segment that sequ() reports equal to "..", when it is not the
 *          first segment and the segment before it is not "..", is
 *          removed together with that preceding segment.
 *  The Java algorithm's third rule (prepending "." to a relative path
 *  whose first segment contains ':') is not performed by this code.
 *
 *  Paths shorter than two characters are left alone.  The path is only
 *  rebuilt when a segment was removed; the rebuilt path keeps a leading
 *  '/' but contains no empty segments and no trailing '/'.
 */
void URI::normalize()
{
    const int total = (int) path.size();
    if (total < 2)
        return;

    const bool rooted = (path[0] == '/');

    //## Split the path into its non-empty segments
    std::vector< std::vector<int> > parts;
    int cursor = rooted ? 1 : 0;
    while (cursor < total)
        {
        const int slash = find(path, '/', cursor);
        if (slash < 0)
            {
            //# no more separators: the rest is the last segment
            parts.push_back(substr(path, cursor, total - cursor));
            break;
            }
        if (slash > cursor)
            parts.push_back(substr(path, cursor, slash - cursor));
        cursor = slash + 1;
        }

    //## Drop "." segments and collapse "name/.." pairs
    bool changed = false;
    unsigned int idx = 0;
    while (idx < parts.size())
        {
        if (sequ(parts[idx], "."))
            {
            parts.erase(parts.begin() + idx);
            changed = true;
            }
        else if (idx > 0 && sequ(parts[idx], "..") &&
                 !sequ(parts[idx - 1], ".."))
            {
            //# remove the previous segment and this one, then continue
            //# with whatever followed them
            parts.erase(parts.begin() + (idx - 1),
                        parts.begin() + (idx + 1));
            idx--;
            changed = true;
            }
        else
            idx++;
        }

    //## Nothing removed: keep the path exactly as it was
    if (!changed)
        return;

    //## Rebuild the path from the surviving segments
    path.clear();
    if (rooted)
        path.push_back('/');
    for (unsigned int k = 0 ; k < parts.size() ; k++)
        {
        if (k > 0)
            path.push_back('/');
        const std::vector<int> &piece = parts[k];
        for (unsigned int j = 0 ; j < piece.size() ; j++)
            path.push_back(piece[j]);
        }
}


//#########################################################################
//# M E S S A G E S
//#########################################################################

/**
 * Write "URI error: ", the printf-style formatted message and a newline
 * to stderr.
 */
void URI::error(const char *fmt, ...)
{
    fputs("URI error: ", stderr);

    va_list ap;
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);

    fputc('\n', stderr);
}

/**
 * Write "URI: ", the printf-style formatted message and a newline
 * to stdout.
 */
void URI::trace(const char *fmt, ...)
{
    fputs("URI: ", stdout);

    va_list ap;
    va_start(ap, fmt);
    vfprintf(stdout, fmt, ap);
    va_end(ap);

    fputc('\n', stdout);
}


//#########################################################################
//# P A R S I N G
//#########################################################################


/**
 * Return the character at position 'p' of the parse buffer, or -1 when
 * 'p' lies outside the buffer.
 */
int URI::peek(int p)
{
    const bool inRange = (p >= 0 && p < parselen);
    if (!inRange)
        return -1;
    return parsebuf[p];
}


/**
 * Test whether the parse buffer contains 'key' at position 'p0'.
 *
 * A key that ends exactly at the end of the buffer counts as a match.
 * Previously the end of the key was only noticed while characters
 * remained in the buffer, so such a key was reported as not matching.
 *
 * @param p0  position in the parse buffer to start comparing at
 * @param key the text to look for
 * @return the position just past the matched text, or p0 when the text
 *         does not match (an empty key therefore also returns p0)
 */
int URI::match(int p0, char const *key)
{
    int p = p0;
    while (*key != '\0')
        {
        //# ran out of buffer, or the characters differ
        if (p < 0 || p >= parselen || *key != parsebuf[p])
            return p0;
        p++; key++;
        }
    return p;
}

//#########################################################################
//#  Parsing is performed according to:
//#  http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#components
//#########################################################################

/**
 * Value of a single hexadecimal digit character, or -1 when 'ch' is not
 * one of 0-9, a-f, A-F.
 */
static int hexNibble(int ch)
{
    if (ch >= '0' && ch <= '9')
        return ch - '0';
    if (ch >= 'a' && ch <= 'f')
        return 10 + ch - 'a';
    if (ch >= 'A' && ch <= 'F')
        return 10 + ch - 'A';
    return -1;
}

/**
 * Read exactly two hexadecimal digits at position 'p0': the first is the
 * upper four bits, the second the lower four.
 *
 * @param p0     position of the first digit
 * @param result receives the value; untouched on failure
 * @return the position after the two digits, or -1 (after reporting
 *         through error()) when either character is not a hex digit
 */
int URI::parseHex(int p0, int &result)
{
    int val = 0;
    for (int k = 0 ; k < 2 ; k++)
        {
        const int ch     = peek(p0 + k);
        const int nibble = hexNibble(ch);
        if (nibble < 0)
            {
            error("parseHex : unexpected character : %c", ch);
            return -1;
            }
        val = (val << 4) + nibble;
        }
    result = val;
    return p0 + 2;
}


/**
 * Parse a character entity of the form "&#xHH;" (two hex digits, as read
 * by parseHex()) at position 'p0'.
 *
 * The "#x" test compares the position returned by match() with the
 * expected one.  match() returns its start position on failure rather
 * than 0, so the previous "!match(...)" test could never report a
 * missing "#x".
 *
 * @param p0     position to parse at
 * @param result receives the character value; untouched unless an entity
 *               was parsed
 * @return p0 when the character at p0 is not '&'; the position after the
 *         ';' on success; -1 (after reporting through error()) when the
 *         entity is malformed
 */
int URI::parseEntity(int p0, int &result)
{
    int p = p0;
    int ch = peek(p);
    if (ch != '&')
        return p0;
    p++;
    if (match(p, "#x") != p + 2)
        {
        error("parseEntity: expected '#x'");
        return -1;
        }
    p += 2;
    int val = 0;
    p = parseHex(p, val);
    if (p<0)
        return -1;
    ch = peek(p);
    if (ch != ';')
        {
        error("parseEntity: expected ';'");
        return -1;
        }
    p++;
    result = val;
    return p;
}

/**
 * Parse a percent escape "%HH" at position 'p0'.
 *
 * @param p0     position to parse at
 * @param result receives the escaped value; untouched unless an escape
 *               was parsed
 * @return p0 when the character at p0 is not '%'; the position after the
 *         two hex digits on success; -1 when parseHex() fails
 */
int URI::parseAsciiEntity(int p0, int &result)
{
    if (peek(p0) != '%')
        return p0;

    int decoded;
    const int next = parseHex(p0 + 1, decoded);
    if (next < 0)
        return -1;

    result = decoded;
    return next;
}


/**
 * Try each entry of the scheme table, in order, at position 'p0'.  On the
 * first match the entry's text, identifier and port are stored in
 * schemeStr, scheme and port.
 *
 * @return the position after the matched scheme text, or p0 when no
 *         entry matches
 */
int URI::parseScheme(int p0)
{
    LookupEntry *entry = schemes;
    while (entry->sval)
        {
        const int after = match(p0, entry->sval);
        if (after > p0)
            {
            schemeStr = entry->sval;
            scheme    = entry->ival;
            port      = entry->port;
            return after;
            }
        ++entry;
        }

    return p0;
}


/**
 * Parse the authority (if any) and the path, starting at position 'p0'.
 *
 * When the text at p0 starts with "//", an authority is read up to the
 * next '/', '?' or '#' (or the end of the buffer).  Characters before
 * the first ':' go to 'authority'; characters after it are collected as
 * the port text and converted with strtol().  "&#xHH;" entities and
 * "%HH" escapes are decoded into 'authority'.
 *
 * Stopping at '?' and '#' as well as '/' keeps a query or fragment that
 * directly follows the host ("//host?x=1") out of the authority.
 *
 * The path is then read up to the next '?' or '#', decoding entities and
 * escapes the same way.  A path that starts with a letter followed by
 * ':' gets a '/' prepended and is flagged absolute; a path that starts
 * with '/' is flagged absolute too.
 *
 * NOTE(review): 'opaque' is set whenever a '/' path follows a "//"
 * authority prefix, which makes resolve() return its argument unchanged
 * for such bases -- confirm that this is intended.
 *
 * @param p0 position to start parsing at
 * @return the position after the path, or -1 when an entity or escape is
 *         malformed
 */
int URI::parseHierarchicalPart(int p0)
{
    int p = p0;
    int ch;

    //# Authority field (host and port, for example)
    int p2 = match(p, "//");
    if (p2 > p)
        {
        p = p2;
        portSpecified = false;
        DOMString portStr;
        while (p < parselen)
            {
            ch = peek(p);
            //# the authority ends where path, query or fragment starts
            if (ch == '/' || ch == '?' || ch == '#')
                break;
            else if (ch == '&') //IRI entity
                {
                int val;
                p2 = parseEntity(p, val);
                if (p2<p)
                    {
                    return -1;
                    }
                p = p2;
                authority.push_back((XMLCh)val);
                }
            else if (ch == '%') //ascii hex escape
                {
                int val;
                p2 = parseAsciiEntity(p, val);
                if (p2<p)
                    {
                    return -1;
                    }
                p = p2;
                authority.push_back((XMLCh)val);
                }
            else if (ch == ':')
                {
                //# everything after the ':' is port text
                portSpecified = true;
                p++;
                }
            else if (portSpecified)
                {
                portStr.push_back((XMLCh)ch);
                p++;
                }
            else
                {
                authority.push_back((XMLCh)ch);
                p++;
                }
            }
        if (portStr.size() > 0)
            {
            const char *pstr = portStr.c_str();
            char *endStr;
            long val = strtol(pstr, &endStr, 10);
            if (endStr > pstr) //successful parse?
                port = (int)val;
            }
        }

    //# Are we absolute?
    ch = peek(p);
    if (uni_is_letter(ch) && peek(p+1)==':')
        {
        //# drive specification: store it behind a leading '/'
        absolute = true;
        path.push_back((XMLCh)'/');
        }
    else if (ch == '/')
        {
        absolute = true;
        if (p>p0) //in other words, if '/' is not the first char
            opaque = true;
        path.push_back((XMLCh)ch);
        p++;
        }

    while (p < parselen)
        {
        ch = peek(p);
        if (ch == '?' || ch == '#')
            break;
        else if (ch == '&') //IRI entity
            {
            int val;
            p2 = parseEntity(p, val);
            if (p2<p)
                {
                return -1;
                }
            p = p2;
            path.push_back((XMLCh)val);
            }
        else if (ch == '%') //ascii hex escape
            {
            int val;
            p2 = parseAsciiEntity(p, val);
            if (p2<p)
                {
                return -1;
                }
            p = p2;
            path.push_back((XMLCh)val);
            }
        else
            {
            path.push_back((XMLCh)ch);
            p++;
            }
        }
    //trace("path:%s", toStr(path).c_str());
    return p;
}

/**
 * Parse a query at position 'p0'.  When the character at p0 is '?', the
 * characters after it, up to the next '#' or the end of the buffer, are
 * appended to 'query'.
 *
 * @return p0 when there is no '?' at p0, otherwise the position where
 *         the query ended
 */
int URI::parseQuery(int p0)
{
    if (peek(p0) != '?')
        return p0;

    int cursor = p0 + 1;
    for ( ; cursor < parselen ; cursor++)
        {
        const int ch = peek(cursor);
        if (ch == '#')
            break;
        query.push_back((XMLCh)ch);
        }

    return cursor;
}

/**
 * Parse a fragment at position 'p0'.  When the character at p0 is '#',
 * every character after it, up to the end of the buffer, is appended to
 * 'fragment'.
 *
 * The fragment is the last component of a URI and may itself contain '?'
 * (RFC 3986, section 3.5), so nothing terminates it early.  Previously
 * parsing stopped at a '?', which left the rest of the input unparsed.
 *
 * @return p0 when there is no '#' at p0, otherwise the position after
 *         the fragment
 */
int URI::parseFragment(int p0)
{
    int p = p0;
    int ch = peek(p);
    if (ch != '#')
        return p0;

    p++;
    while (p < parselen)
        {
        ch = peek(p);
        fragment.push_back(ch);
        p++;
        }

    return p;
}


/**
 * Parse the buffer from position 'p0' in four stages: scheme,
 * hierarchical part, query, fragment.  Each stage starts where the
 * previous one stopped.
 *
 * @return the position after the last stage, or -1 (after reporting the
 *         stage through error()) as soon as a stage returns a negative
 *         position
 */
int URI::parse(int p0)
{
    int p = parseScheme(p0);
    if (p < 0)
        {
        error("Scheme");
        return -1;
        }

    p = parseHierarchicalPart(p);
    if (p < 0)
        {
        error("Hierarchical part");
        return -1;
        }

    p = parseQuery(p);
    if (p < 0)
        {
        error("Query");
        return -1;
        }

    p = parseFragment(p);
    if (p < 0)
        {
        error("Fragment");
        return -1;
        }

    return p;
}


/**
 * Parse 'str' into this URI and normalize the resulting path.
 *
 * The characters of 'str' are copied into a temporary buffer with every
 * '\' replaced by '/', and parsebuf/parselen describe that buffer while
 * parsing runs.  The buffer is a local std::vector, so it is released on
 * every exit path, and parsebuf/parselen are reset afterwards so that no
 * pointer to freed memory is left behind.  (The earlier NULL test after
 * "new" could never fire, since a failed "new" throws.)
 *
 * The existing fields are not cleared first; the parsed components are
 * appended to whatever is already stored.
 *
 * @param str the text to parse
 * @return true on success; false (after reporting "Syntax error" through
 *         error()) when parsing failed
 */
bool URI::parse(const DOMString &str)
{
    //# Working copy of the input, with '\' treated as '/'
    std::vector<int> buffer;
    buffer.reserve(str.size());
    for (DOMString::const_iterator iter = str.begin() ;
         iter != str.end() ; ++iter)
        {
        int ch = *iter;
        if (ch == '\\')
            buffer.push_back('/');
        else
            buffer.push_back(ch);
        }

    parselen = (int)buffer.size();
    if (buffer.empty())
        parsebuf = NULL;
    else
        parsebuf = &buffer[0];

    int p = parse(0);
    normalize();

    //# The buffer goes away with this function
    parsebuf = NULL;
    parselen = 0;

    if (p < 0)
        {
        error("Syntax error");
        return false;
        }

    //printf("uri:%s\n", toString().c_str());
    //printf("parse:%s\n", toStr(path).c_str());

    return true;
}


}  //namespace dom
}  //namespace w3c
}  //namespace org
//#########################################################################
//# E N D    O F    F I L E
//#########################################################################