xmlreader.cpp revision 6cd2e86330e1049942b9ce57d4f10bbe2542067d
/*
* Phoebe DOM Implementation.
*
* This is a C++ approximation of the W3C DOM model, which follows
* fairly closely the specifications in the various .idl files, copies of
* which are provided for reference. Most important is this one:
*
* http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
*
* Authors:
* Bob Jamison
*
* Copyright (C) 2005-2008 Bob Jamison
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xmlreader.h"
#include "ucd.h"
#include "domimpl.h"
#include <stdio.h>
#include <stdarg.h>
namespace org
{
namespace w3c
{
namespace dom
{
//#########################################################################
//# E N T I T Y T A B L E
//#########################################################################
/**
 * One row of the predefined-entity lookup table: the escaped form as it
 * appears in the input, its length, and the text it stands for.
 */
struct EntityInfo
{
//escaped spelling, including the leading '&' and trailing ';'
const char *escape;
//number of characters in 'escape'
int escapeLength;
//replacement text appended to the output when 'escape' is matched
const char *value;
};
/**
 * The entities recognized by XmlReader::parseEntity().  The list is
 * terminated by a row whose 'escape' is NULL; parseEntity() stops
 * scanning when it reaches that row.
 */
static EntityInfo entityTable[] =
{
{ "&amp;" , 5 , "&" },
{ "&lt;" , 4 , "<" },
{ "&gt;" , 4 , ">" },
{ "&apos;" , 6 , "'" },
{ "&quot;" , 6 , "\"" },
//sentinel row: NULL escape marks the end of the table
{ NULL , 0 , "\0" }
};
//#########################################################################
//# M E S S A G E S
//#########################################################################
/**
 * Report a parse problem on stderr.
 *
 * The output is a fixed prefix carrying the current lineNr and colNr,
 * followed by the printf-style message built from fmt and the variadic
 * arguments, followed by a newline.
 *
 * @param fmt printf-style format string for the message
 */
void XmlReader::error(const char *fmt, ...)
{
    //location prefix first
    fprintf(stderr, "XmlReader:error at line %d, column %d:", lineNr, colNr);

    //then the caller's formatted message
    va_list argList;
    va_start(argList, fmt);
    vfprintf(stderr, fmt, argList);
    va_end(argList);

    //terminate the line
    fputc('\n', stderr);
}
//#########################################################################
//# U T I L I T Y
//#########################################################################
/**
 * Strip leading and trailing blank characters from str, in place.
 *
 * A character counts as blank when its value is <= ' ' or > 126
 * (the same test the previous implementation used).  Characters in the
 * interior of the string are never touched.  A string that consists
 * only of blank characters becomes empty.
 *
 * NOTE(review): the "> 126" half of the test also strips non-ASCII
 * characters at the edges -- confirm that this is wanted for the
 * DOMString encoding in use.
 *
 * @param str the string to trim; modified in place
 */
static void trim(DOMString &str)
{
    int len = str.size();
    if (len < 1)
        return;

    //find the first character that is NOT blank
    int start = 0;
    while (start < len)
    {
        int ch = str[start];
        if (ch > ' ' && ch <= 126)
            break;
        start++;
    }

    //nothing but blanks
    if (start >= len)
    {
        str = "";
        return;
    }

    //find the last character that is NOT blank; str[start] is known
    //to be non-blank, so the scan cannot run past it
    int end = len - 1;
    while (end > start)
    {
        int ch = str[end];
        if (ch > ' ' && ch <= 126)
            break;
        end--;
    }

    //substr() takes (position, count), not (position, end index)
    str = str.substr(start, end - start + 1);
}
//#########################################################################
//# P A R S I N G
//#########################################################################
/**
 * Read the character at position p and account for it in the
 * line/column counters.
 *
 * Every '\n' and every '\r' bumps lineNr and resets colNr to 0; any
 * other character bumps colNr.
 *
 * @param p index into parsebuf
 * @return the character at p, or -1 when p is at or beyond len
 */
int XmlReader::get(int p)
{
    if (p >= len)
        return -1;

    const int ch = parsebuf[p];
    const bool isLineBreak = (ch == '\n' || ch == '\r');
    if (!isLineBreak)
    {
        colNr++;
        return ch;
    }

    //start of a new line
    lineNr++;
    colNr = 0;
    return ch;
}
/**
 * Read the character at position p without touching the line/column
 * counters.
 *
 * @param p index into parsebuf
 * @return the character at p, or -1 when p is at or beyond len
 */
int XmlReader::peek(int p)
{
    return (p < len) ? static_cast<int>(parsebuf[p]) : -1;
}
/**
 * Check whether the NUL-terminated string str occurs in parsebuf
 * starting at position pos.  Characters are fetched with peek(), so
 * running off the end of the buffer simply yields a mismatch and the
 * line/column counters are not affected.
 *
 * @param pos index in parsebuf at which the comparison starts
 * @param str the text to look for
 * @return true if every character of str matches, false otherwise
 */
bool XmlReader::match(int pos, const char *str)
{
    for (const char *cursor = str; *cursor != '\0'; ++cursor, ++pos)
    {
        if (peek(pos) != *cursor)
            return false;
    }
    return true;
}
/**
* Test if the given substring exists at the given position
* in a given buffer
*/
/*
static bool bufMatch(const DOMString &buf, int pos, char *str)
{
while (*str)
{
if (buf[pos++] != *str++)
return false;
}
return true;
}
*/
/**
 * Advance past characters for which uni_is_space() is true.
 *
 * Characters are read with get(), so the line/column counters are
 * updated for every character examined, including the one that stops
 * the scan.
 *
 * @param p index in parsebuf at which to start
 * @return index of the first character that is not a space, or a value
 *         >= len if the buffer ran out first
 */
int XmlReader::skipwhite(int p)
{
    for ( ; p < len; ++p)
    {
        if (!uni_is_space(get(p)))
            return p;
    }
    return p;
}
/**
 * Collect a word starting at p and append it to result.
 *
 * The word ends at the first character that is <= ' ' or is one of
 * '/', '>' or '='; that character is not consumed.
 * TODO (from the original author): allow all characters that are legal
 * in an element or attribute name.
 *
 * @param p      index in parsebuf at which to start
 * @param result receives the characters of the word (appended)
 * @return index of the character that ended the word, or a value
 *         >= len if the buffer ran out first
 */
int XmlReader::getWord(int p, DOMString &result)
{
    for ( ; p < len; ++p)
    {
        const int b = get(p);
        const bool isDelimiter =
            (b <= ' ' || b == '/' || b == '>' || b == '=');
        if (isDelimiter)
            break;
        result.push_back((XMLCh)b);
    }
    return p;
}
/**
 * Collect a possibly prefixed name ("prefix:local" or just "local").
 *
 * Scanning stops, without consuming it, at the first character that is
 * <= ' ' or is one of '/', '>' or '='.  Whenever a ':' is seen, what
 * has been gathered in shortWord so far becomes the prefix and
 * shortWord starts over.
 *
 * @param p         index in parsebuf at which to start
 * @param prefix    receives the part before the ':' (if any)
 * @param shortWord receives the part after the ':' (or the whole name)
 * @param fullWord  receives "prefix:shortWord", or just shortWord when
 *                  the prefix is empty
 * @return index of the character that ended the name, or a value
 *         >= len if the buffer ran out first
 */
int XmlReader::getPrefixedWord(int p, DOMString &prefix,
                DOMString &shortWord, DOMString &fullWord)
{
    for ( ; p < len; ++p)
    {
        const int b = get(p);
        if (b <= ' ' || b == '/' || b == '>' || b == '=')
            break;

        if (b != ':')
        {
            shortWord.push_back((XMLCh)b);
            continue;
        }

        //separator: everything collected so far was the prefix
        prefix = shortWord;
        shortWord = "";
    }

    if (prefix.size() == 0)
        fullWord = shortWord;
    else
        fullWord = prefix + ":" + shortWord;

    return p;
}
/**
 * Read a quoted literal and append its decoded text to result.
 *
 * The character at p0 must be '"' or '\''.  The literal runs up to the
 * next occurrence of that SAME quote character, so the other kind of
 * quote may appear inside it (e.g. "it's").  Entity references are
 * decoded with parseEntity() and end up in the output at the position
 * where they occurred in the input.
 *
 * If the buffer ends before the closing quote, whatever was collected
 * is appended and the end position is returned.
 *
 * @param p0     index of the opening quote
 * @param result receives the decoded contents (appended); left
 *               untouched on failure
 * @return index of the character after the closing quote, or p0 if
 *         there is no opening quote at p0 or an entity cannot be decoded
 */
int XmlReader::getQuoted(int p0, DOMString &result)
{
    int p = p0;

    //remember which quote opened the literal; only that one closes it
    int quote = peek(p);
    if (quote != '"' && quote != '\'')
        return p0;
    get(p++); //step over the opening quote

    DOMString buf;
    while (p < len)
    {
        //look before consuming, so that parseEntity() is handed the
        //position of the '&' itself and columns are counted only once
        int b = peek(p);
        if (b == quote)
        {
            get(p++); //step over the closing quote
            break;
        }
        else if (b == '&')
        {
            p = parseEntity(p, buf);
            if (p < 0)
                return p0;
        }
        else
        {
            get(p++);
            buf.push_back((XMLCh)b);
        }
    }

    //printf("quoted text:'%s'\n", buf.c_str());
    result.append(buf);
    return p;
}
/**
* Parse a <!xml> tag. Node may be null. Assumes current char is '<'
* ends on char after '>'
*/
int XmlReader::parseVersion(int p0)
{
int p = p0;
if (!match(p, "<?xml"))
return p0;
p += 5;
colNr += 5;
bool quickCloseDummy;
NodePtr node = new NodeImpl();
int p2 = parseAttributes(p, node, &quickCloseDummy);
if (p2 < p)
{
//smart ptr!!do not delete node;
return p0;
}
p = p2;
//get the attributes that we need
NamedNodeMap attributes = node->getAttributes();
NodePtr attr = attributes.getNamedItem("version");
if (attr.get())
document->setXmlVersion(attr->getNodeValue());
attr = attributes.getNamedItem("encoding");
if (attr.get())
{ /*document->setXmlEncoding(attr->getNodeValue());*/ }
attr = attributes.getNamedItem("standalone");
if (attr.get())
document->setXmlStandalone((attr->getNodeValue() == "yes"));
//#now we should be pointing at '?>'
if (!match(p, "?>"))
{
return p0;
}
//skip over '?>'
get(p++);
get(p++);
return p;
}
/**
* Parse a <!DOCTYPE> tag. doctype may be null. Expects '<'
* on start. Ends pointing at char after '>'
*/
int XmlReader::parseDoctype(int p0)
{
int p = p0;
if (!match(p, "<!DOCTYPE"))
return p0;
p += 9;
colNr += 9;
DocumentTypePtr doctype = document->getDoctype();
if (!doctype)
return p0;
//### get the root name of the document
p = skipwhite(p);
DOMString rootName;
int p2 = getWord(p, rootName);
if (p2 <= p)
return p0;
p = p2;
//printf("doctype root '%s'\n", rootName.c_str());
while (p < len)
{
p = skipwhite(p);
if (peek(p) == '>')
break;
else if (peek(p) == '[') //just ignore 'internal' [] stuff
{
while (p < len)
{
int ch = get(p++);
if (ch == ']')
break;
}
p++;
}
else if (match(p, "PUBLIC"))
{
p += 6;
colNr += 6;
p = skipwhite(p);
DOMString pubIdLiteral;
int p2 = getQuoted(p, pubIdLiteral);
if (p2 <= p)
return p0;
p = p2;
p = skipwhite(p);
DOMString systemLiteral;
p2 = getQuoted(p, systemLiteral);
if (p2 <= p)
return p0;
p = p2;
//printf("PUBLIC \"%s\" \"%s\" \n",
// pubIdLiteral.c_str(), systemLiteral.c_str());
}
else if (match(p, "SYSTEM"))
{
p += 6;
colNr += 6;
p = skipwhite(p);
DOMString systemLiteral;
int p2 = getQuoted(p, systemLiteral);
if (p2 <= p)
return p0;
p = p2;
//printf("SYSTEM \"%s\" \n", systemLiteral.c_str());
}
}
//skip over '>'
get(p++);
return p;
}
/**
 * Parse a comment of the form <!-- ... -->.
 *
 * Everything between the opening and closing markers is stored as the
 * node value of 'comment'.  If the buffer ends before "-->" is seen,
 * the remaining text is taken as the comment body.
 *
 * The scan runs all the way to len (the same bound parseCDATA() uses)
 * so that a terminator sitting at the very end of the buffer is still
 * recognized; match() is bounds-safe, so no look-ahead margin is
 * needed.
 *
 * @param p0      index of the '<' that starts the comment
 * @param comment node that receives the comment text
 * @return index of the character after "-->", or p0 if the text at p0
 *         does not start a comment
 */
int XmlReader::parseComment(int p0, CommentPtr comment)
{
    int p = p0;
    if (!match(p, "<!--"))
        return p0;
    colNr += 4;
    p += 4;

    DOMString buf;
    while (p < len)
    {
        if (match(p, "-->"))
        {
            p += 3;
            colNr += 3;
            break;
        }
        int ch = get(p++);
        buf.push_back((XMLCh)ch);
    }

    comment->setNodeValue(buf);
    return p;
}
/**
*
*/
int XmlReader::parseCDATA(int p0, CDATASectionPtr cdata)
{
int p = p0;
if (!match(p, "<![CDATA["))
return p0;
colNr += 9;
p += 9;
DOMString buf;
while (p<len)
{
if (match(p, "]]>"))
{
p +=3;
colNr += 3;
break;
}
int ch = get(p++);
buf.push_back((XMLCh)ch);
}
/*printf("Got CDATA:%s\n",buf.c_str());*/
cdata->setNodeValue(buf);
return p;
}
/**
*
*/
int XmlReader::parseText(int p0, TextPtr text)
{
int p = p0;
DOMString buf;
while (p<len)
{
if (peek(p) == '&')
{
p = parseEntity(p, buf);
if (p < 0) //error?
return p0;
}
else if (peek(p) == '<')
{
break;
}
else
{
int ch = get(p++);
buf.push_back((XMLCh)ch);
}
}
/*printf("Got Text:%s\n",buf.c_str());*/
text->setNodeValue(buf);
return p;
}
/**
 * Parses the attributes of a tag and attaches them to node.
 *
 * Scanning starts at p0 and repeats: skip whitespace, read a (possibly
 * prefixed) name, expect '=', read a quoted value, create the attribute
 * through document->createAttributeNS() and add it to
 * node->getAttributes().
 *
 * The loop ends, leaving the position ON that character, when it sees
 * '?' (end of an <?xml declaration) or '>' (end of a normal tag).  It
 * also ends when no name can be read or when the name is not followed
 * by '='.  For a self-closing tag ("/>") the position AFTER the '>' is
 * returned and *quickClose is set to true.
 *
 * @param p0         index in parsebuf at which to start
 * @param node       node that receives the attributes
 * @param quickClose set to true when the tag ends in "/>", false
 *                   otherwise
 * @return the position as described above
 */
int XmlReader::parseAttributes(int p0, NodePtr node, bool *quickClose)
{
*quickClose = false;
int p = p0;
//NOTE(review): this local map is never used below
NamedNodeMap attributes;
while (p<len)
{
/*printf("ch:%c\n",ch);*/
p = skipwhite(p);
//get() is used only to look at the character here; p is not advanced
int ch = get(p);
/*printf("ch:%c\n",ch);*/
if (ch == '?' || ch == '>')//done
break;
//NOTE(review): 'p<len+1' is always true inside this loop (p<len)
else if (ch=='/' && p<len+1)
{
//possible self-closing tag: step over '/' and look for '>'
p++;
p = skipwhite(p);
ch = peek(p);
if (ch == '>')
{
p++;
*quickClose = true;
/*printf("quick close\n");*/
return p;
}
//no '>' after the '/': fall through and try to read a name
}
DOMString shortName;
DOMString prefix;
DOMString qualifiedName;
int p2 = getPrefixedWord(p, prefix, shortName, qualifiedName);
//no characters consumed means there is no attribute name here
if (p2 <= p)
break;
/*printf("name:%s",buf);*/
p = p2;
p = skipwhite(p);
ch = get(p);
/*printf("ch:%c\n",ch);*/
if (ch != '=')
break;
p++;
p = skipwhite(p);
/*ch = parsebuf[p];*/
/*printf("ch:%c\n",ch);*/
DOMString attrValue;
//getQuoted() returns its start position on failure; that case is not
//distinguished here, so the attribute is created with whatever value
//was collected
p2 = getQuoted(p, attrValue);
p = p2;
/*printf("name:'%s' value:'%s'\n",buf,buf2);*/
//namespace declarations (xmlns or xmlns:foo) get the XMLNS namespace
DOMString namespaceURI = "";
if (prefix == "xmlns" || shortName == "xmlns")
namespaceURI = XMLNSNAME;
//## Now let us make the attribute and give it to the node
AttrPtr attr = document->createAttributeNS(namespaceURI, qualifiedName);
attr->setValue(attrValue);
node->getAttributes().setNamedItemNS(attr);
}//while p<len
return p;
}
/**
* Appends the value of an entity to the buffer
*/
int XmlReader::parseEntity(int p0, DOMString &buf)
{
int p = p0;
for (EntityInfo *info = entityTable ; info->escape ; info++)
{
if (match(p, info->escape))
{
p += info->escapeLength;
colNr += info->escapeLength;
buf += info->value;
return p;
}
}
error("unterminated entity");
return -1;
}
//#########################################################################
//# P A R S E A N O D E
//#########################################################################
/**
 * Parse one element, including its attributes, content and closing
 * tag, preserving the original structure as much as possible.
 *
 * 'node' is renamed to the tag name that is read and then filled in:
 * attributes through parseAttributes(); child elements recursively;
 * comments, CDATA sections and text as child nodes.  When parseAsData
 * is set, comments are discarded and CDATA/text are concatenated into
 * the node value (trimmed) instead of being added as children.
 *
 * Nested <?xml ...?> and <!DOCTYPE ...> constructs are handed to
 * parseVersion() / parseDoctype() and the position is advanced past
 * them.
 *
 * @param p0    index of the '<' that opens the element
 * @param node  node to fill in
 * @param depth nesting level; only passed on to recursive calls
 * @return index of the character after the closing tag's '>' (or after
 *         "/>" for a self-closing tag), or p0 on any failure
 */
int XmlReader::parseNode(int p0, NodePtr node, int depth)
{
    int p = p0;

    //### OPEN TAG
    int ch = get(p++);
    if (ch != '<')
        return p0;
    p = skipwhite(p);
    DOMString openTagName;
    DOMString openTagNamePrefix;
    DOMString openTagQualifiedName;
    int p2 = getPrefixedWord(p,openTagNamePrefix,
                openTagName, openTagQualifiedName);
    if (p2 <= p)
        return p0;
    p = p2;
    p = skipwhite(p);

    //printf("qualifiedName:%s\n", openTagQualifiedName.c_str());
    DOMString namespaceURI = node->lookupNamespaceURI(openTagNamePrefix);
    document->renameNode(node, namespaceURI, openTagQualifiedName);

    //### ATTRIBUTES
    bool quickClose;
    p = parseAttributes(p, node, &quickClose);
    if (quickClose)  //trivial tag:  <name/>
        return p;

    p++; //skip over '>'

    DOMString nodeValue;

    /* ### Get intervening data ### */
    while (p<len && keepGoing)
    {
        //### COMMENT
        if (match(p, "<!--"))
        {
            CommentPtr comment = document->createComment("");
            p2 = parseComment(p, comment);
            if (p2 <= p)
                return p0;
            p = p2;
            if (parseAsData)
            { //throw away
                //delete comment;
            }
            else
            {
                node->appendChild(comment);
            }
        }
        //### VERSION
        else if (match(p, "<?xml"))
        {
            p2 = parseVersion(p);
            if (p2 <= p)
                return p0;
            //advance past the declaration; otherwise the same text
            //would be matched again on every iteration
            p = p2;
        }
        //### DOCTYPE
        else if (match(p, "<!DOCTYPE"))
        {
            p2 = parseDoctype(p);
            if (p2 <= p)
                return p0;
            //advance past the declaration (see above)
            p = p2;
        }
        //### CDATA
        else if (match(p, "<![CDATA["))
        {
            CDATASectionPtr cdata = document->createCDATASection("");
            p2 = parseCDATA(p, cdata);
            if (p2 <= p)
                return p0;
            p = p2;
            if (parseAsData)
            {
                nodeValue += cdata->getNodeValue();
                //delete cdata;
            }
            else
            {
                node->appendChild(cdata);
            }
        }
        //### OPEN OR CLOSE TAG
        else if (peek(p) == '<')
        {
            p2 = skipwhite(p+1);
            if (peek(p2) =='/')
            {
                //our own closing tag: leave the loop pointing at '/'
                p = p2;
                break;
            }
            else
            {
                /*Add element to tree*/
                ElementPtr elem = document->createElement(""); //fill in name later
                node->appendChild(elem);
                p2 = parseNode(p, elem, depth+1);
                if (p2 <= p)
                {
                    /*printf("problem on element:%ls.  p2:%d p:%d\n",n->name, p2, p);*/
                    return p0;
                }
                p = p2;
            }
        }
        //### TEXT
        else
        {
            TextPtr text = document->createTextNode("");
            p2 = parseText(p, text);
            if (p2 <= p)
                return p0;
            p = p2;
            if (parseAsData)
            {
                nodeValue += text->getNodeValue();
                //delete text;
            }
            else
            {
                node->appendChild(text);
            }
        }

    }//while (p<len)

    //printf("%d : nodeValue:'%s'\n", p, nodeValue.c_str());
    trim(nodeValue);
    node->setNodeValue(nodeValue);

    //### get close tag.  we should be pointing at '/'
    p = skipwhite(p);
    ch = get(p);
    if (ch != '/')
    {
        error("no / on end tag");
        return p0;
    }
    p++;

    //### get word after '/'
    p = skipwhite(p);
    DOMString closeTagName;
    DOMString closeTagNamePrefix;
    DOMString closeTagQualifiedName;
    p = getPrefixedWord(p, closeTagNamePrefix, closeTagName,
                        closeTagQualifiedName);
    if (openTagQualifiedName != closeTagQualifiedName)
    {
        error("Mismatched closing tag.  Expected </%s>. Got '%s'.",
              openTagQualifiedName.c_str(), closeTagQualifiedName.c_str());
        return p0;
    }
    p = skipwhite(p);
    //peek() is bounds-checked; input that ends right after the tag
    //name is reported as an error instead of indexing past the buffer
    if (peek(p) != '>')
    {
        error("no > on end tag");
        return p0;
    }
    p++;
    /*printf("close element:%ls\n",buf);*/
    return p;
}
/**
 * Parse a buffer into a freshly created document.
 *
 * A new document is obtained from the DOM implementation, then the
 * prolog (any sequence of comments, an <?xml ...?> declaration and a
 * <!DOCTYPE ...>) is consumed, and finally the root element is parsed
 * into document->getDocumentElement().
 *
 * If a prolog item fails to parse, the document built so far is
 * returned immediately.
 *
 * @param buf          the text to parse
 * @param bufferOffset index in buf at which parsing starts
 * @param parseLen     stored in len and used as the upper bound for
 *                     every position
 * @return the document
 */
org::w3c::dom::DocumentPtr
XmlReader::parse(const DOMString &buf, int bufferOffset, int parseLen)
{
    len       = parseLen;
    parsebuf  = buf;
    keepGoing = true;

    DOMImplementationSourceImpl source;
    DOMImplementation *domImpl = source.getDOMImplementation("");

    document = domImpl->createDocument("", "", NULL);
    //document = new svg::SVGDocumentImpl(domImpl, "", "", NULL);

    //### PROLOG
    int p = bufferOffset;
    while (p < len && keepGoing)
    {
        p = skipwhite(p);

        //### COMMENT
        if (match(p, "<!--"))
        {
            CommentPtr comment = document->createComment("");
            const int next = parseComment(p, comment);
            if (next <= p)
                return document;
            p = next;
            //in data mode comments are thrown away
            if (!parseAsData)
                document->appendChild(comment);
            continue;
        }

        //### VERSION
        if (match(p, "<?xml"))
        {
            const int next = parseVersion(p);
            if (next <= p)
                return document;
            p = next;
            continue;
        }

        //### DOCTYPE
        if (match(p, "<!DOCTYPE"))
        {
            const int next = parseDoctype(p);
            if (next <= p)
                return document;
            p = next;
            continue;
        }

        //anything else ends the prolog
        break;
    }

    //### ROOT ELEMENT
    p = skipwhite(p);
    parseNode(p, document->getDocumentElement(), 0);

    keepGoing = false;
    return document;
}
/**
 * Parse a complete string into a document and normalize the result.
 *
 * @param str the text to parse, from its first to its last character
 * @return the parsed document; normalizeDocument() has been called on
 *         it unless the pointer is null
 */
org::w3c::dom::DocumentPtr
XmlReader::parse(const DOMString &str)
{
    DocumentPtr doc = parse(str, 0, str.size());

    const bool parsed = !(!doc);
    if (parsed)
        doc->normalizeDocument();

    return doc;
}
/**
 * Load a file with loadFile() and parse its contents.
 *
 * @param fileName name of the file to read
 * @return the parsed document, or a default-constructed (null)
 *         DocumentPtr when loadFile() yields an empty buffer
 */
org::w3c::dom::DocumentPtr
XmlReader::parseFile(const DOMString &fileName)
{
    DOMString contents = loadFile(fileName);
    if (contents.size() == 0)
        return DocumentPtr(); /*nothing loaded: null document*/

    return parse(contents, 0, contents.size());
}
//#########################################################################
//# S T R E A M R E A D I N G
//#########################################################################
/**
 * Read an entire file into a string, one byte per character.
 *
 * The file is opened in binary mode.  No error is reported when the
 * name is empty or the file cannot be opened; the result is simply
 * empty in those cases.
 *
 * @param fileName name of the file to read
 * @return the file contents, or an empty string as described above
 */
org::w3c::dom::DOMString
XmlReader::loadFile(const DOMString &fileName)
{
    DOMString contents;
    if (fileName.size() == 0)
        return contents;

    FILE *in = fopen(fileName.c_str(), "rb");
    if (!in)
    {
        //error here
        return contents;
    }

    //fgetc() yields a negative value at end of file or on a read error
    for (int ch = fgetc(in); ch >= 0; ch = fgetc(in))
    {
        contents.push_back((XMLCh)ch);
    }

    fclose(in);
    return contents;
}
//#########################################################################
//# C O N S T R U C T O R / D E S T R U C T O R
//#########################################################################
/**
 * Default constructor.  Creates a reader with no document and an empty
 * parse buffer, parseAsData switched off, and the position counters
 * reset (line 1, column 0).
 */
XmlReader::XmlReader() :
document(),
parsebuf(),
keepGoing(false),
parseAsData(false),
pos(0),
len(0),
lineNr(1),
colNr(0)
{
}
/**
 * Constructor that selects the parsing mode.  Identical to the default
 * constructor except that parseAsData is taken from the argument.
 *
 * @param parseAsDataArg initial value of parseAsData; when true,
 *        parse() and parseNode() discard comments, and parseNode()
 *        collects text and CDATA into the element's node value
 *        instead of adding child nodes
 */
XmlReader::XmlReader(bool parseAsDataArg) :
document(),
parsebuf(),
keepGoing(false),
parseAsData(parseAsDataArg),
pos(0),
len(0),
lineNr(1),
colNr(0)
{
}
/**
 * Destructor.  The body is empty; members are released by their own
 * destructors.
 */
XmlReader::~XmlReader()
{
}
} //namespace dom
} //namespace w3c
} //namespace org
//#########################################################################
//# E N D O F F I L E
//#########################################################################