xpathparser.cpp revision 6cd2e86330e1049942b9ce57d4f10bbe2542067d
/*
* Phoebe DOM Implementation.
*
* This is a C++ approximation of the W3C DOM model, which follows
* fairly closely the specifications in the various .idl files, copies of
* which are provided for reference. Most important is this one:
*
*
* Authors:
* Bob Jamison
*
* Copyright (C) 2006-2007 Bob Jamison
*
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "ucd.h"
#include "xpathparser.h"
namespace org
{
namespace w3c
{
namespace dom
{
namespace xpath
{
//#########################################################################
//# M E S S A G E S
//#########################################################################
{
if (!debug)
return;
fprintf(f, "XPathParser: ");
fprintf(f, "\n");
}
{
fprintf(f, "XPathParser ERROR: ");
fprintf(f, "\n");
//Print location in string
for (int i=0 ; i<position ; i++)
fprintf(f, " ");
fprintf(f, "^\n");
}
{
if (!debug)
return;
return;
for (int i=0 ; i<indent ; i++)
}
//#########################################################################
//# L E X I C A L S C A N N I N G
//#########################################################################
{
}
{
}
{
}
{
}
void XPathParser::lexicalTokenDump()
{
printf("####### LEXICAL TOKENS #######\n");
for (unsigned int i=0 ; i<lexicalTokens.size() ; i++)
{
printf("%d : ", i);
lexicalTokens[i].print();
}
printf("##### END LEXICAL TOKENS #####\n\n");
}
{
if (p < 0 || p>=(int)lexicalTokens.size())
{
return tok;
}
return lexicalTokens[p];
}
int XPathParser::lexTokType(int p)
{
if (p < 0 || p>=(int)lexicalTokens.size())
return -1;
return lexicalTokens[p].getType();
}
int XPathParser::peek(int p)
{
if (p >= parselen)
return -1;
position = p;
return parsebuf[p] ;
}
int XPathParser::get(int p)
{
if (p >= parselen)
return -1;
position = p;
return parsebuf[p];
}
{
int p = p0;
while (p < parselen)
{
if (!uni_is_space(ch))
break;
}
return p;
}
{
int p = p0;
while (p < parselen)
{
if (!uni_is_letter_or_digit(ch))
break;
}
return p;
}
{
while (*str)
{
if (p >= parselen)
return -1;
return -1;
p++; str++;
}
return p;
}
{
int p = p0;
if (p >= parselen)
return p0;/*need at least x*/
bool isdouble = false;
bool negative = false;
if (ch=='-')
{
p++;
negative = true;
}
bool seen_dot = false;
bool seen_e = false;
bool seen_eminus = false;
int i = p;
while (i < parselen)
{
if (ch=='.')
{
if (seen_dot)
return p0;
seen_dot = true;
isdouble = true;
}
{
return p0;
seen_e = true;
}
{
if (seen_eminus || !seen_dot)
return p0;
seen_eminus = true;
}
else if (!uni_is_digit(ch))
break;
i++;
}
if (i == p)/*no digits*/
return p0;
if (isdouble)
{
char *end;
if (!end)/*not a number?*/
{
return p0;
}
}
else
{
char *end;
if (!end)/*not a number?*/
{
return p0;
}
}
p = i;
return p;
}
{
int p = p0;
int quotechar = 0;
{
}
else
return p0;
p++;
while (true)
{
if (p >= parselen)
{
error("Unterminated literal string");
return -1;
}
break;
p++;
}
p++; //skip over closing "
return p;
}
/**
* NCName is a 'non-colonized' name
*/
{
int p = p0;
return p0;
p++;
while (p < parselen)
{
if ( uni_is_letter_or_digit(ch) ||
// isCombiningChar(ch) ||
// isExtender(ch) ||
{
p++;
}
else
break;
}
return p;
}
/**
* Name parsing with post-parsing
*/
{
int p = p0;
if (ch == '*')
{
p++;
return p;
}
if (p2 <= p)
return p0;
p = p2;
{
return p;
}
return p;
p++;
if (ch == '*')
{
p++;
return p;
}
if (p2 <= p)
{
return p0;
error("Nothing after ':' in QName");
return -1;
}
p = p2;
return p;
}
int XPathParser::lexicalScan()
{
int p = 0;
int p2 = p;
while (p < parselen)
{
p = p2;
//trace("nextChar:%c", peek(p));
bool selected = false;
//### LITERAL EXPR TOKENS
for (int i=2 ; i<=10 ; i++)
{
if (p2 > p)
{
p = p2;
selected = true;
break;
}
}
if (selected)
continue;
//### OPERATORS
{
if (p2 > p)
{
//according to the disambiguating rule for * in the spec
{
{
p = p2;
selected = true;
break;
}
}
else
{
p = p2;
selected = true;
break;
}
}
}
if (selected)
continue;
//### NODE TYPES
{
if (p2 > p)
{
p = p2;
selected = true;
break;
}
}
if (selected)
continue;
//### AXIS NAMES
{
if (p2 > p)
{
p = p2;
selected = true;
break;
}
}
if (selected)
continue;
//### NAME TEST
if (p2 > p)
{
else
p = p2;
selected = true;
}
if (selected)
continue;
//### VARIABLE REFERENCE
if (peek(p) == '$')
{
p++;
if (p2 > p)
{
p = p2;
selected = true;
}
else
{
error("Variable referenced with '$' requires a qualified name\n");
return -1;
}
}
if (selected)
continue;
//### NUMBER
double numval;
if (p2 > p)
{
p = p2;
selected = true;
}
if (selected)
continue;
//### LITERAL
if (p2 > p)
{
p = p2;
selected = true;
}
if (selected)
continue;
//### CHAR (default, none of the above)
p++;
}//while p
return p;
}
//#########################################################################
//# X P A T H G R A M M A R P A R S I N G
//#########################################################################
//## Various shorthand methods to add a token to the list
{
}
{
}
{
}
{
}
{
}
//########################################
//# Grammar - specific parsing
//########################################
/**
* [1] LocationPath ::=
* RelativeLocationPath
* | AbsoluteLocationPath
*/
{
int p = p0;
p = skipwhite(p);
if (p2 > p)
{
return p2;
}
if (p2 > p)
{
return p2;
}
return p0;
}
/**
* [2] AbsoluteLocationPath ::=
* '/' RelativeLocationPath?
* | AbbreviatedAbsoluteLocationPath
*/
{
int p = p0;
{
p++;
if (p2 <= p)
{
error("Relative path after '/'");
return -1;
}
p = p2;
return p;
}
//AbbreviatedAbsoluteLocationPath
{
p++;
if (p2 <= p)
{
error("Relative path after '//'");
return -1;
}
p = p2;
return p;
}
return p0;
}
/**
* [3] RelativeLocationPath ::=
* Step
* | RelativeLocationPath '/' Step
* | AbbreviatedRelativeLocationPath
*/
{
int p = p0;
if (p2 < 0)
return -1;
if (p2 > p)
{
p = p2;
{
p++;
if (p2 < 0)
{
error("Relative path after '/'");
return -1;
}
p = p2;
return p;
}
//AbbreviatedRelativeLocationPath
{
p++;
// a '//' is an abbreviation for /descendant-or-self:node()/
if (p2 < 0)
{
error("Relative path after '//'");
return -1;
}
p = p2;
return p;
}
return p;
}
return p0;
}
/**
* [4] Step ::=
* AxisSpecifier NodeTest Predicate*
* | AbbreviatedStep
*/
{
int p = p0;
//This can be (and usually is) 0-length
if (p2 < 0)
{
error("Axis specifier in step section");
return -1;
}
p = p2;
if (p2 < 0)
{
error("Node test in step section");
return -1;
}
if (p2 > p)
{
p = p2;
if (p2 < 0)
{
error("Predicate in step section");
return -1;
}
p = p2;
return p;
}
//AbbreviatedStep
if (lexTokType(p) == DOT)
{
p++;
return p;
}
//AbbreviatedStep
if (lexTokType(p) == DOUBLE_DOT)
{
p++;
return p;
}
return p0;
}
/**
* [5] AxisSpecifier ::=
* AxisName '::'
* | AbbreviatedAxisSpecifier
*/
{
int p = p0;
if (lexTokType(p) == AXIS_NAME)
{
int axisType = t.getIntValue();
p++;
if (lexTokType(p) != DOUBLE_COLON)
{
error("'::' required after axis name literal");
return -1;
}
p++;
switch (axisType)
{
case ANCESTOR_OR_SELF:
break;
case ANCESTOR:
break;
case ATTRIBUTE:
break;
case CHILD:
break;
case DESCENDANT_OR_SELF:
break;
case DESCENDANT:
break;
case FOLLOWING_SIBLING:
break;
case FOLLOWING:
break;
case NAMESPACE:
break;
case PARENT:
break;
case PRECEDING_SIBLING:
break;
case PRECEDING:
break;
case SELF:
break;
default:
{
return -1;
}
}
return p;
}
//AbbreviatedAxisSpecifier
if (lexTokType(p) == AMPR)
{
p++;
return p;
}
return p0;
}
/**
* [6] AxisName ::=
* 'ancestor'
* | 'ancestor-or-self'
* | 'attribute'
* | 'child'
* | 'descendant'
* | 'descendant-or-self'
* | 'following'
* | 'following-sibling'
* | 'namespace'
* | 'parent'
* | 'preceding'
* | 'preceding-sibling'
* | 'self'
* NOTE: This definition, and those at the bottom, is not
* needed. Its functionality is handled by lexical scanning.
* It is left here for reference.
*/
{
return p0;
}
/**
* [7] NodeTest ::=
* NameTest
* | NodeType '(' ')'
* | 'processing-instruction' '(' Literal ')'
*/
{
int p = p0;
{
p++;
return p;
}
{
if (t.getIntValue() == PROCESSING_INSTRUCTION)
{
if (lexTokType(p) != LPAREN ||
{
error("processing instruction requires (\"literal string\")");
return -1;
}
p += 3;
}
else
{
{
error("processing instruction requires ()");
return -1;
}
p += 2;
}
return p;
}
return p0;
}
/**
* [8] Predicate ::=
* '[' PredicateExpr ']'
*/
{
int p = p0;
if (lexTokType(p) != LBRACKET)
return p0;
p++;
if (p2 <= p)
{
error("Predicate expression in predicate");
return -1;
}
p = p2;
if (lexTokType(p) != RBRACKET)
{
error("Predicate expression requires closing ']'");
return -1;
}
p++;
return p;
}
/**
* [9] PredicateExpr ::=
* Expr
*/
{
int p = p0;
if (p2 < 0)
{
error("Expression in predicate expression");
return -1;
}
p = p2;
return p;
}
/**
* [10] AbbreviatedAbsoluteLocationPath ::=
* '//' RelativeLocationPath
* NOTE: not used. handled in getAbsoluteLocationPath()
*/
{
return p0;
}
/**
* [11] AbbreviatedRelativeLocationPath ::=
* RelativeLocationPath '//' Step
* NOTE: not used. handled in getRelativeLocationPath()
*/
{
return p0;
}
/**
* [12] AbbreviatedStep ::=
* '.'
* | '..'
* NOTE: not used. handled in getStep()
*/
{
return p0;
}
/**
* [13] AbbreviatedAxisSpecifier ::=
* '@'?
* NOTE: not used. handled in getAxisSpecifier()
*/
{
return p0;
}
/**
* [14] Expr ::=
* OrExpr
*/
{
int p = p0;
if (p2 < 0)
{
error("OR expression in expression");
return -1;
}
p = p2;
return p;
}
/**
* [15] PrimaryExpr ::=
* VariableReference
* | '(' Expr ')'
* | Literal
* | Number
* | FunctionCall
*/
{
int p = p0;
int p2 = p;
if (lexTokType(p) == VARIABLE_REFERENCE)
{
p++;
return p;
}
if (lexTokType(p) == LPAREN)
{
p++;
if (p2 <= p)
{
error("Expression in primary expression");
return -1;
}
p += p2;
if (lexTokType(p) != RPAREN)
{
error("Primary expression requires closing ')'");
return -1;
}
}
if (lexTokType(p) == LITERAL)
{
p++;
return p;
}
if (lexTokType(p) == NUMBER)
{
p++;
return p;
}
if (p2 < 0)
{
error("Function call in primary expression");
return -1;
}
if (p2 > p)
{
p = p2;
return p;
}
return p0;
}
/**
* [16] FunctionCall ::=
* FunctionName '(' ( Argument ( ',' Argument )* )? ')'
*/
{
int p = p0;
if (lexTokType(p) != FUNCTION_NAME)
return p0;
p++;
return p0;
p++;
int argCount = 0;
if (p2 < 0)
{
error("Error in function argument");
return -1;
}
if (p2 > p)
{
argCount++;
p = p2;
while (lexTokType(p) == COMMA)
{
p++;
if (p2 <= p)
{
error("Error in function argument");
return -1;
}
if (p2 > p)
argCount++;
//do we add a token here? i dont think so
p = p2;
}
}
{
error("Function requires closing ')'");
return -1;
}
p++;
// Function names from http://www.w3.org/TR/xpath#NT-FunctionName
if (name == "last")
else if (name == "position")
else if (name == "count")
else if (name == "id")
else if (name == "local-name")
else if (name == "namespace-uri")
else if (name == "name")
else if (name == "string")
else if (name == "concat")
else if (name == "starts-with")
else if (name == "contains")
else if (name == "substring-before")
else if (name == "substring-after")
else if (name == "substring")
else if (name == "string-length")
else if (name == "normalize-space")
else if (name == "translate")
else if (name == "boolean")
else if (name == "not")
else if (name == "true")
else if (name == "false")
else if (name == "lang")
else if (name == "number")
else if (name == "sum")
else if (name == "floor")
else if (name == "ceiling")
else if (name == "round")
else
{
return -1;
}
return p;
}
/**
* [17] Argument ::=
* Expr
*/
{
int p = p0;
if (p2 < 0)
{
error("Argument expression");
return -1;
}
p = p2;
return p;
}
/**
* [18] UnionExpr ::=
* PathExpr
* | UnionExpr '|' PathExpr
*/
{
int p = p0;
if (p2 < 0)
{
error("Path expression for union");
return -1;
}
p = p2;
{
p++;
if (p2 < 0)
{
error("OR (|) requires union expression on the left");
return -1;
}
p = p2;
}
return p;
}
/**
* [19] PathExpr ::=
* LocationPath
* | FilterExpr
* | FilterExpr '/' RelativeLocationPath
* | FilterExpr '//' RelativeLocationPath
*/
{
int p = p0;
int p2;
if (p2 < 0)
{
error("Location path in path expression");
return -1;
}
if (p2 > p)
{
p = p2;
return p;
}
if (p2 < 0)
{
error("Filter expression in path expression");
return -1;
}
if (p2 <= p)
return p0;
p = p2;
{
p++;
if (p2 < 0)
{
error("Relative location after / in path expression");
return -1;
}
p = p2;
return p;
}
{
p++;
if (p2 < 0)
{
error("Relative location after // in path expression");
return -1;
}
p = p2;
return p;
}
return p;
}
/**
* [20] FilterExpr ::=
* PrimaryExpr
* | FilterExpr Predicate
*/
{
int p = p0;
if (p2 < 0)
{
error("Primary expression in path expression");
return -1;
}
if (p2 > p)
{
p = p2;
while (true)
{
if (p2 < 0)
{
error("Predicate in primary expression");
return -1;
}
if (p2 > p)
{
p = p2;
}
else
break;
}
return p;
}
return p0;
}
/**
* [21] OrExpr ::=
* AndExpr
* | OrExpr 'or' AndExpr
*/
{
int p = p0;
if (p2 < 0)
{
error("AND expression in OR expression");
return -1;
}
if (p2 > p)
{
p = p2;
{
p++;
if (p2 <= p)
{
error("AND expression in OR expression");
return -1;
}
p = p2;
return p;
}
return p;
}
return p0;
}
/**
* [22] AndExpr ::=
* EqualityExpr
* | AndExpr 'and' EqualityExpr
*/
{
int p = p0;
if (p2 < 0)
{
error("Equality expression in AND expression");
return -1;
}
if (p2 > p)
{
p = p2;
{
p++;
if (p2 <= p)
{
error("AND expression after 'and'");
return -1;
}
p = p2;
return p;
}
return p;
}
return p0;
}
/**
* [23] EqualityExpr ::=
* RelationalExpr
* | EqualityExpr '=' RelationalExpr
* | EqualityExpr '!=' RelationalExpr
*/
{
int p = p0;
if (p2 < 0)
{
error("Relation expression in equality expression");
return -1;
}
if (p2 > p)
{
p = p2;
{
p++;
if (p2 <= p)
{
error("Equality expression expected after ==");
return -1;
}
p = p2;
return p;
}
{
p++;
if (p2 <= p)
{
error("Equality expression expected after !=");
return -1;
}
p = p2;
return p;
}
return p;
}
return p0;
}
/**
* [24] RelationalExpr ::=
* AdditiveExpr
* | RelationalExpr '<' AdditiveExpr
* | RelationalExpr '>' AdditiveExpr
* | RelationalExpr '<=' AdditiveExpr
* | RelationalExpr '>=' AdditiveExpr
*/
{
int p = p0;
if (p2 < 0)
{
error("Additive expression in relational expression");
return -1;
}
if (p2 > p)
{
p = p2;
{
p++;
if (p2 <= p)
{
error("Relational expression after '>'");
return -1;
}
p = p2;
return p;
}
{
p++;
if (p2 <= p)
{
error("Relational expression after '<'");
return -1;
}
p = p2;
return p;
}
{
p++;
if (p2 <= p)
{
error("Relational expression after '>='");
return -1;
}
p = p2;
return p;
}
{
p++;
if (p2 <= p)
{
error("Relational expression after '<='");
return -1;
}
p = p2;
return p;
}
return p;
}
return p0;
}
/**
* [25] AdditiveExp ::=
* MultiplicativeExpr
* | AdditiveExpr '+' MultiplicativeExpr
* | AdditiveExpr '-' MultiplicativeExpr
*/
{
int p = p0;
if (p2 < 0)
{
error("Multiplicative expression in additive expression");
return -1;
}
if (p2 > p)
{
p = p2;
{
p++;
if (p2 <= p)
{
error("Additive expression after '+'");
return -1;
}
p = p2;
return p;
}
{
p++;
if (p2 <= p)
{
error("Additive expression after '-'");
return -1;
}
p = p2;
return p;
}
return p;
}
return p0;
}
/**
* [26] MultiplicativeExpr ::=
* UnaryExpr
* | MultiplicativeExpr MultiplyOperator UnaryExpr
* | MultiplicativeExpr 'div' UnaryExpr
* | MultiplicativeExpr 'mod' UnaryExpr
*/
{
int p = p0;
if (p2 < 0)
{
error("Unary expression in multiplicative expression");
return -1;
}
if (p2 > p)
{
p = p2;
{
p++;
if (p2 <= p)
{
error("Multiplicative expression after '*'");
return -1;
}
p = p2;
return p;
}
{
p++;
if (p2 <= p)
{
error("Multiplicative expression after 'div'");
return -1;
}
p = p2;
return p;
}
{
p++;
if (p2 <= p)
{
error("Multiplicative expression after 'mod'");
return -1;
}
p = p2;
return p;
}
return p;
}
return p0;
}
/**
* [27] UnaryExpr ::=
* UnionExpr
* | '-' UnaryExpr
*/
{
int p = p0;
if (p2 < 0)
{
error("Union expression in unary expression");
return -1;
}
if (p2 > p)
{
p = p2;
return p;
}
if (lexTokType(p) == '-')
{
p++;
if (p2 < 0)
{
error("Unary expression after '-'");
return -1;
}
p = p2;
return p;
}
return p0;
}
//######################################################
//# NOT USED!!!
//## The grammar definitions below are
//## handled by lexical parsing, and will not be used
//######################################################
/**
* [28] ExprToken ::=
* '(' | ')' | '[' | ']' | '.' | '..' | '@' | ',' | '::'
* | NameTest
* | NodeType
* | Operator
* | FunctionName
* | AxisName
* | Literal
* | Number
* | VariableReference
*/
{
return p0;
}
/**
* [29] Literal ::=
* '"' [^"]* '"'
* | "'" [^']* "'"
*/
{
return p0;
}
/**
* [30] Number ::=
* Digits ('.' Digits?)?
* | '.' Digits
*/
{
return p0;
}
/**
* [31] Digits ::=
* [0-9]+
*/
{
return p0;
}
/**
* [32] Operator ::=
* OperatorName
* | MultiplyOperator
* | '/' | '//' | '|' | '+' | '-' | '='
* | '!=' | '<' | '<=' | '>' | '>='
*/
{
return p0;
}
/**
* [33] OperatorName ::=
* 'and' | 'or' | 'mod' | 'div'
*/
{
return p0;
}
/**
* [34] MultiplyOperator ::=
* '*'
*/
{
return p0;
}
/**
* [35] FunctionName ::=
* QName - NodeType
*/
{
return p0;
}
/**
* [36] VariableReference ::=
* '$' QName
*/
{
return p0;
}
/**
* [37] NameTest ::=
* '*'
* | NCName ':' '*'
* | QName
*/
{
return p0;
}
/**
* [38] NodeType ::=
* 'comment'
* | 'text'
* | 'processing-instruction'
* | 'node'
*/
{
return p0;
}
/**
* [39] ExprWhitespace ::=
* S
*/
{
return p0;
}
//#########################################################################
//# H I G H L E V E L P A R S I N G
//#########################################################################
/**
* Parse a candidate XPath string. Leave a copy in 'tokens.'
*/
{
int p0 = 0;
position = 0;
lexicalScan();
int p = getLocationPath(p0, 0);
parselen = 0;
if (p <= p0)
{
//return false;
}
return true;
}
//#########################################################################
//# E V A L U A T E
//#########################################################################
/**
* This wraps the two-step call to parse(), then execute() to get a NodeList
* of matching DOM nodes
*/
const DOMString &xpathString)
{
//### Maybe do caching for speed here
//### Parse and execute
//### Error message can be generated as a side effect
if (!parse(xpathString))
return list;
if (debug)
//### Execute the token list
{
//error
}
return results;
}
} // namespace xpath
} // namespace dom
} // namespace w3c
} // namespace org
//#########################################################################
//# E N D O F F I L E
//#########################################################################