// parseCommon.cxx revision 7c478bd95313f5f23a4c958a745db2134aa03244
// Copyright (c) 1994 James Clark
// See the file COPYING for copying permission.
#pragma ident "%Z%%M% %I% %E% SMI"
#include "splib.h"
#include "Parser.h"
#include "token.h"
#include "MessageArg.h"
#include "ParserMessages.h"
#include "constant.h"
#include "NumericCharRefOrigin.h"
#include "macros.h"
#ifdef SP_NAMESPACE
namespace SP_NAMESPACE {
#endif
// Collect the data of a processing instruction up to tokenPic and hand it
// to the event handler as an ImmediatePiEvent.
// Returns 0 (after reporting) if the entity ends first, or if half the
// collected length exceeds syntax().pilen(), in which case the closing
// delimiter is reported as missing.  Returns 1 once the event is delivered.
Boolean Parser::parseProcessingInstruction()
{
  currentInput()->startToken();
  Location piLocation(currentLocation());
  StringC piData;
  for (Token tok = getToken(piMode); tok != tokenPic; tok = getToken(piMode)) {
    if (tok == tokenEe) {
      message(ParserMessages::processingInstructionEntityEnd);
      return 0;
    }
    if (tok == tokenUnrecognized)
      reportNonSgmlCharacter();   // the character is still collected below
    else if (tok != tokenChar)
      continue;                   // other tokens contribute no data
    piData += *currentInput()->currentTokenStart();
    if (piData.size()/2 > syntax().pilen()) {
      message(ParserMessages::processingInstructionLength,
              NumberMessageArg(syntax().pilen()));
      message(ParserMessages::processingInstructionClose);
      return 0;
    }
  }
  // Properly closed, but possibly still longer than the declared limit.
  if (piData.size() > syntax().pilen())
    message(ParserMessages::processingInstructionLength,
            NumberMessageArg(syntax().pilen()));
  if (options().warnPiMissingName) {
    // nameEnd is the index just past a leading name, or 0 when the data
    // does not begin with a name start character.
    size_t nameEnd = 0;
    if (piData.size() && syntax().isNameStartCharacter(piData[0])) {
      nameEnd = 1;
      while (nameEnd < piData.size()
             && syntax().isNameCharacter(piData[nameEnd]))
        nameEnd++;
    }
    // A name must be present and, unless it is the whole data, be followed
    // by a character for which isS() is true.
    if (nameEnd == 0
        || (nameEnd < piData.size() && !syntax().isS(piData[nameEnd])))
      message(ParserMessages::piMissingName);
  }
  noteMarkup();
  eventHandler().pi(new (eventAllocator()) ImmediatePiEvent(piData, piLocation));
  return 1;
}
// Parse the body of a literal, accumulating its content in text, until
// getToken() returns tokenLit or tokenLita.
//   litMode        - mode passed to getToken() while at the input level the
//                    literal started on.
//   liteMode       - mode passed to getToken() while the input level is above
//                    the starting level (after an entity reference).
//   maxLength      - length above which tooLongMessage is reported.
//   tooLongMessage - message issued, with maxLength as its argument, when the
//                    finished literal is longer than maxLength.
//   flags          - combination of literalDelimInfo, literalSingleSpace,
//                    literalNonSgml, literalDataTag and literalNoProcess.
//   text           - cleared on entry, then filled with the literal.
// Returns 0 when parsing is abandoned: entity end at the starting level, a
// presumed missing closing delimiter, a failed character or entity reference,
// or an attribute literal handled as unterminated.  Returns 1 otherwise,
// including when the literal is only reported as too long.
Boolean Parser::parseLiteral(Mode litMode,
Mode liteMode,
size_t maxLength,
const MessageType1 &tooLongMessage,
unsigned flags,
Text &text)
{
unsigned startLevel = inputLevel();
Mode currentMode = litMode;
// If the literal gets to be longer than this, then we assume
// that the closing delimiter has been omitted if we're at the end
// of a line and at the starting input level.
// (Twice maxLength, saturating at size_t(-1) instead of overflowing.)
size_t reallyMaxLength = (maxLength > size_t(-1)/2
? size_t(-1)
: maxLength * 2);
text.clear();
Location startLoc(currentLocation());
if (flags & literalDelimInfo)
text.addStartDelim(currentLocation());
for (;;) {
Token token = getToken(currentMode);
switch (token) {
case tokenEe:
// Entity end while still at the starting input level: report and give up.
if (inputLevel() == startLevel) {
message(ParserMessages::literalLevel);
return 0;
}
text.addEntityEnd(currentLocation());
popInputStack();
// Back at the starting level: resume using the literal's own mode.
if (inputLevel() == startLevel)
currentMode = litMode;
break;
case tokenUnrecognized:
// If it was not reported as a non-SGML character, report it as
// literalMinimumData instead.
if (reportNonSgmlCharacter())
break;
message(ParserMessages::literalMinimumData,
StringMessageArg(currentToken()));
break;
case tokenRs:
text.ignoreChar(currentChar(), currentLocation());
break;
case tokenRe:
if (text.size() > reallyMaxLength && inputLevel() == startLevel) {
#if 0
message(tooLongMessage, NumberMessageArg(maxLength));
#endif
// guess that the closing delimiter has been omitted
Messenger::setNextLocation(startLoc);
message(ParserMessages::literalClosingDelimiter);
return 0;
}
// fall through
case tokenSepchar:
// With literalSingleSpace, a character arriving at the start of the text or
// right after a space is passed to ignoreChar(); otherwise a space is added
// whose location records the character it replaces.
if ((flags & literalSingleSpace)
&& (text.size() == 0 || text.lastChar() == syntax().space()))
text.ignoreChar(currentChar(), currentLocation());
else
text.addChar(syntax().space(),
Location(new ReplacementOrigin(currentLocation(),
currentChar()),
0));
break;
case tokenSpace:
// Same single-space rule as above, but the character itself is added.
if ((flags & literalSingleSpace)
&& (text.size() == 0 || text.lastChar() == syntax().space()))
text.ignoreChar(currentChar(), currentLocation());
else
text.addChar(currentChar(), currentLocation());
break;
case tokenCroDigit:
case tokenHcroHexDigit:
{
// Numeric character reference (decimal or hexadecimal).
Char c;
Location loc;
if (!parseNumericCharRef(token== tokenHcroHexDigit, c, loc))
return 0;
Boolean isSgmlChar;
// A failed translation has already been reported; skip the character.
if (!translateNumericCharRef(c, isSgmlChar))
break;
if (!isSgmlChar) {
// Non-SGML characters are stored only when literalNonSgml is set.
if (flags & literalNonSgml)
text.addNonSgmlChar(c, loc);
else
message(ParserMessages::numericCharRefLiteralNonSgml,
NumberMessageArg(c));
break;
}
if (flags & literalDataTag) {
if (!syntax().isSgmlChar(c))
message(ParserMessages::dataTagPatternNonSgml);
else if (syntax().charSet(Syntax::functionChar)->contains(c))
message(ParserMessages::dataTagPatternFunction);
}
// The single-space rule also applies to a referenced space character.
if ((flags & literalSingleSpace)
&& c == syntax().space()
&& (text.size() == 0 || text.lastChar() == syntax().space()))
text.ignoreChar(c, loc);
else
text.addChar(c, loc);
}
break;
case tokenCroNameStart:
if (!parseNamedCharRef())
return 0;
break;
case tokenEroGrpo:
message(inInstance() ? ParserMessages::eroGrpoStartTag : ParserMessages::eroGrpoProlog);
break;
case tokenLit:
case tokenLita:
// Closing delimiter: leave the loop.
if (flags & literalDelimInfo)
text.addEndDelim(currentLocation(), token == tokenLita);
goto done;
case tokenPeroNameStart:
if (options().warnInternalSubsetLiteralParamEntityRef
&& inputLevel() == 1)
message(ParserMessages::internalSubsetLiteralParamEntityRef);
// fall through
case tokenEroNameStart:
{
ConstPtr<Entity> entity;
Ptr<EntityOrigin> origin;
// literalNoProcess makes the reference parse with ignoreLevel 2.
if (!parseEntityReference(token == tokenPeroNameStart,
(flags & literalNoProcess) ? 2 : 0,
entity, origin))
return 0;
if (!entity.isNull())
entity->litReference(text, *this, origin,
(flags & literalSingleSpace) != 0);
// If the reference raised the input level, switch to liteMode until the
// matching entity end is seen.
if (inputLevel() > startLevel)
currentMode = liteMode;
}
break;
case tokenPeroGrpo:
message(ParserMessages::peroGrpoProlog);
break;
case tokenCharDelim:
// Reported, then treated as an ordinary data character.
message(ParserMessages::dataCharDelim,
StringMessageArg(StringC(currentInput()->currentTokenStart(),
currentInput()->currentTokenLength())));
// fall through
case tokenChar:
// Same missing-delimiter guess as for tokenRe, for an RE character that
// arrives as a plain data character.
if (text.size() > reallyMaxLength && inputLevel() == startLevel
&& currentChar() == syntax().standardFunction(Syntax::fRE)) {
#if 0
message(tooLongMessage, NumberMessageArg(maxLength));
#endif
// guess that the closing delimiter has been omitted
Messenger::setNextLocation(startLoc);
message(ParserMessages::literalClosingDelimiter);
return 0;
}
text.addChar(currentChar(), currentLocation());
break;
}
}
done:
// With literalSingleSpace, a trailing space is dropped.
if ((flags & literalSingleSpace)
&& text.size() > 0
&& text.lastChar() == syntax().space())
text.ignoreLastChar();
if (text.size() > maxLength) {
// For the attribute literal modes, first let AttributeValue decide
// whether the over-long literal should be handled as unterminated.
switch (litMode) {
case alitMode:
case alitaMode:
case talitMode:
case talitaMode:
if (AttributeValue::handleAsUnterminated(text, *this))
return 0;
// not handled: falls into default and reports the length below
default:
break;
}
message(tooLongMessage, NumberMessageArg(maxLength));
}
return 1;
}
// Parse a named character reference: read the function name, look it up,
// consume the reference end and, when the name is known, push the
// corresponding character back onto the input with pushCharRef().
// An unknown name is reported but is not a failure; this always returns 1.
Boolean Parser::parseNamedCharRef()
{
  if (options().warnNamedCharRef)
    message(ParserMessages::namedCharRef);
  InputSource *input = currentInput();
  Index refStart = currentLocation().index();
  input->discardInitial();
  extendNameToken(syntax().namelen(), ParserMessages::nameLength);

  StringC funName;
  getCurrentToken(syntax().generalSubstTable(), funName);
  Char funChar;
  Boolean known = 1;
  if (!syntax().lookupFunctionChar(funName, &funChar)) {
    message(ParserMessages::functionName, StringMessageArg(funName));
    known = 0;
  }
  else if (wantMarkup())
    getCurrentToken(funName); // the original name

  // Classify how the reference ended; anything other than tokenRefc
  // draws the optional refc warning.
  NamedCharRef::RefEndType endType = NamedCharRef::endOmitted;
  const Token endTok = getToken(refMode);
  if (endTok == tokenRefc)
    endType = NamedCharRef::endRefc;
  else {
    if (endTok == tokenRe)
      endType = NamedCharRef::endRE;
    if (options().warnRefc)
      message(ParserMessages::refc);
  }
  input->startToken();
  if (known)
    input->pushCharRef(funChar, NamedCharRef(refStart, endType, funName));
  return 1;
}
// Parse a numeric character reference, decimal or (when isHex is set)
// hexadecimal, and consume its reference end.
// On success returns 1 with ch set to the character number and loc set to
// a location whose origin is a NumericCharRefOrigin for the reference.
// Returns 0, leaving ch and loc untouched, when the number does not fit
// in a Char or is not declared in the document character set.
Boolean Parser::parseNumericCharRef(Boolean isHex, Char &ch, Location &loc)
{
  InputSource *input = currentInput();
  Location refStart = currentLocation();
  input->discardInitial();

  // Extend the current token over all the digits of the number.
  if (isHex)
    extendHexNumber();
  else
    extendNumber(syntax().namelen(), ParserMessages::numberLength);

  // Accumulate the value, rejecting anything that would exceed charMax.
  const int radix = isHex ? 16 : 10;
  Boolean ok = 1;
  Char number = 0;
  const Char *digitsEnd = input->currentTokenEnd();
  for (const Char *d = input->currentTokenStart(); d < digitsEnd; d++) {
    int weight = isHex ? sd().hexDigitWeight(*d) : sd().digitWeight(*d);
    if (number > charMax/radix || (number *= radix) > charMax - weight) {
      message(ParserMessages::characterNumber, StringMessageArg(currentToken()));
      ok = 0;
      break;
    }
    number += weight;
  }
  if (ok && !sd().docCharsetDecl().charDeclared(number)) {
    ok = 0;
    message(ParserMessages::characterNumber, StringMessageArg(currentToken()));
  }

  // Record the markup for the reference (if wanted) before the number
  // token is replaced by the reference end token.
  const Boolean keepMarkup = wantMarkup();
  Owner<Markup> markupPtr;
  if (keepMarkup) {
    markupPtr = new Markup;
    markupPtr->addDelim(isHex ? Syntax::dHCRO : Syntax::dCRO);
    markupPtr->addNumber(input);
  }
  const Token endTok = getToken(refMode);
  if (endTok == tokenRefc) {
    if (keepMarkup)
      markupPtr->addDelim(Syntax::dREFC);
  }
  else {
    if (keepMarkup && endTok == tokenRe)
      markupPtr->addRefEndRe();
    if (options().warnRefc)
      message(ParserMessages::refc);
  }

  if (!ok)
    return 0;
  ch = number;
  loc = Location(new NumericCharRefOrigin(refStart,
                                          currentLocation().index()
                                          + currentInput()->currentTokenLength()
                                          - refStart.index(),
                                          markupPtr),
                 0);
  return 1;
}
// Translate ch, a character number in the document character set,
// into the internal character set (ch is updated in place).
// Returns 1 with isSgmlChar set to 1 when ch is usable as a character
// (no translation is done when the two character sets are the same).
// Returns 1 with isSgmlChar set to 0 when the number is a non-SGML char
// (ie described as UNUSED in the SGML declaration).
// Returns 0, after reporting why, when no internal character is available.
Boolean Parser::translateNumericCharRef(Char &ch, Boolean &isSgmlChar)
{
  if (sd().internalCharsetIsDocCharset()) {
    if (options().warnNonSgmlCharRef) {
      if (!syntax().isSgmlChar(ch))
        message(ParserMessages::nonSgmlCharRef);
    }
    isSgmlChar = 1;
    return 1;
  }

  UnivChar universal;
  if (sd().docCharset().descToUniv(ch, universal)) {
    // Go from the universal number to the internal character set.
    WideChar internal;
    ISet<WideChar> internalSet;
    switch (sd().internalCharset().univToDesc(universal,
                                              internal,
                                              internalSet)) {
    case 1:
      if (internal <= charMax) {
        isSgmlChar = 1;
        ch = Char(internal);
        return 1;
      }
      // fall through
    case 2:
      message(ParserMessages::numericCharRefBadInternal,
              NumberMessageArg(ch));
      return 0;
    default:
      message(ParserMessages::numericCharRefNoInternal,
              NumberMessageArg(ch));
      return 0;
    }
  }

  // No universal number: consult the declaration to find out why.
  const PublicId *baseSetId;
  CharsetDeclRange::Type declType;
  Number baseNumber;
  StringC description;
  if (!sd().docCharsetDecl().getCharInfo(ch, baseSetId, declType,
                                         baseNumber, description)) {
    CANNOT_HAPPEN();
  }
  else if (declType == CharsetDeclRange::unused) {
    if (options().warnNonSgmlCharRef)
      message(ParserMessages::nonSgmlCharRef);
    isSgmlChar = 0;
    return 1;
  }
  if (declType == CharsetDeclRange::string)
    message(ParserMessages::numericCharRefUnknownDesc,
            NumberMessageArg(ch),
            StringMessageArg(description));
  else
    message(ParserMessages::numericCharRefUnknownBase,
            NumberMessageArg(ch),
            NumberMessageArg(baseNumber),
            StringMessageArg(baseSetId->string()));
  return 0;
}
// Parse a general or parameter entity reference and resolve its name.
// isParameter: selects the parameter entity delimiter (dPERO), name length
// limit (penamelen) and lookup; otherwise the general entity ones are used.
// ignoreLevel: 0 means don't ignore;
// 1 means parse name group and ignore if inactive
// 2 means ignore
// entity: set to an IgnoredEntity when the reference is ignored, otherwise
// to the result of lookupEntity(); a general entity that is not found while
// a DTD is applicable is replaced by one from createUndefinedEntity().
// It can be left null in the other not-found cases.
// origin: set to a new EntityOrigin when entity is non-null, else to null.
// Returns 0 if the name group cannot be parsed or is not followed by a name
// start character; returns 1 otherwise, even when the entity is undefined.
Boolean Parser::parseEntityReference(Boolean isParameter,
int ignoreLevel,
ConstPtr<Entity> &entity,
Ptr<EntityOrigin> &origin)
{
InputSource *in = currentInput();
Location startLocation(in->currentLocation());
Owner<Markup> markupPtr;
if (wantMarkup()) {
markupPtr = new Markup;
markupPtr->addDelim(isParameter ? Syntax::dPERO : Syntax::dERO);
}
if (ignoreLevel == 1) {
// Set the parser's current markup aside while the name group is parsed,
// so the name group's markup can be collected into markupPtr.
Markup savedMarkup;
Markup *savedCurrentMarkup = currentMarkup();
if (savedCurrentMarkup)
savedCurrentMarkup->swap(savedMarkup);
Location savedMarkupLocation(markupLocation());
startMarkup(markupPtr != 0, startLocation);
if (markupPtr) {
markupPtr->addDelim(Syntax::dGRPO);
markupPtr->swap(*currentMarkup());
}
Boolean ignore;
// NOTE(review): this early return does not swap the saved markup back.
if (!parseEntityReferenceNameGroup(ignore))
return 0;
// Take the collected markup back, then restore what was set aside.
if (markupPtr)
currentMarkup()->swap(*markupPtr);
startMarkup(savedCurrentMarkup != 0, savedMarkupLocation);
if (savedCurrentMarkup)
savedMarkup.swap(*currentMarkup());
// The name group did not ask for the reference to be ignored.
if (!ignore)
ignoreLevel = 0;
// The name group must be followed directly by the entity name.
in->startToken();
Xchar c = in->tokenChar(messenger());
if (!syntax().isNameStartCharacter(c)) {
message(ParserMessages::entityReferenceMissingName);
return 0;
}
}
in->discardInitial();
if (isParameter)
extendNameToken(syntax().penamelen(), ParserMessages::parameterEntityNameLength);
else
extendNameToken(syntax().namelen(), ParserMessages::nameLength);
StringC &name = nameBuffer();
getCurrentToken(syntax().entitySubstTable(), name);
if (ignoreLevel)
// Ignored reference: no lookup is done.
entity = new IgnoredEntity(name,
isParameter
? Entity::parameterEntity
: Entity::generalEntity);
else {
entity = lookupEntity(isParameter, name, startLocation, 1);
if (entity.isNull()) {
if (haveApplicableDtd()) {
if (!isParameter) {
// Only general entities get an undefined-entity placeholder.
entity = createUndefinedEntity(name, startLocation);
message(ParserMessages::entityUndefined,
StringMessageArg(name));
}
else
message(ParserMessages::parameterEntityUndefined,
StringMessageArg(name));
}
else
message(ParserMessages::entityApplicableDtd);
}
else if (entity->defaulted() && options().warnDefaultEntityReference)
message(ParserMessages::defaultEntityReference, StringMessageArg(name));
}
// Consume the reference end.  getToken(refMode) is called exactly once on
// every path; the name is added to the markup first because it is taken
// from the current token.
if (markupPtr) {
markupPtr->addName(in);
switch (getToken(refMode)) {
case tokenRefc:
markupPtr->addDelim(Syntax::dREFC);
break;
case tokenRe:
markupPtr->addRefEndRe();
if (options().warnRefc)
message(ParserMessages::refc);
break;
default:
if (options().warnRefc)
message(ParserMessages::refc);
break;
}
}
else if (options().warnRefc) {
if (getToken(refMode) != tokenRefc)
message(ParserMessages::refc);
}
else
(void)getToken(refMode);
// The length handed to the origin runs from startLocation to the end of
// the reference end token.
if (!entity.isNull())
origin = EntityOrigin::make(internalAllocator(),
entity,
startLocation,
currentLocation().index()
+ currentInput()->currentTokenLength()
- startLocation.index(),
markupPtr);
else
origin = (EntityOrigin *)0;
return 1;
}
// Read tokens in the given mode until tokenCom closes the comment,
// recording the comment's characters in the current markup when there is
// one.  Returns 1 when the comment is closed, 0 (after reporting
// commentEntityEnd at the comment's start location) if the entity ends first.
Boolean Parser::parseComment(Mode mode)
{
  Location commentStart(currentLocation());
  Markup *markup = currentMarkup();
  if (markup)
    markup->addCommentStart();
  for (;;) {
    const Token tok = getToken(mode);
    if (tok == tokenCom)
      return 1;
    if (tok == tokenEe) {
      message(ParserMessages::commentEntityEnd, commentStart);
      return 0;
    }
    if (tok == tokenUnrecognized) {
      // Not a non-SGML character, so it gets the comment-specific message.
      if (!reportNonSgmlCharacter())
        message(ParserMessages::sdCommentSignificant,
                StringMessageArg(currentToken()));
    }
    else if (markup)
      markup->addCommentChar(currentChar());
  }
}
// Extend the current token over every following name character, then end
// the token at the resulting length.  tooLongMessage is issued, with
// maxLength as its argument, when that length exceeds maxLength.
void Parser::extendNameToken(size_t maxLength,
                             const MessageType1 &tooLongMessage)
{
  InputSource *input = currentInput();
  size_t tokenLength = input->currentTokenLength();
  const Syntax &syn = syntax();
  for (;;) {
    if (!syn.isNameCharacter(input->tokenChar(messenger())))
      break;
    ++tokenLength;
  }
  if (tokenLength > maxLength)
    message(tooLongMessage, NumberMessageArg(maxLength));
  input->endToken(tokenLength);
}
// Extend the current token over every following digit, then end the token
// at the resulting length.  tooLongMessage is issued, with maxLength as
// its argument, when that length exceeds maxLength.
void Parser::extendNumber(size_t maxLength, const MessageType1 &tooLongMessage)
{
  InputSource *input = currentInput();
  size_t tokenLength = input->currentTokenLength();
  for (;;) {
    if (!syntax().isDigit(input->tokenChar(messenger())))
      break;
    ++tokenLength;
  }
  if (tokenLength > maxLength)
    message(tooLongMessage, NumberMessageArg(maxLength));
  input->endToken(tokenLength);
}
void Parser::extendHexNumber()
{
InputSource *in = currentInput();
size_t length = in->currentTokenLength();
while (syntax().isHexDigit(in->tokenChar(messenger())))
length++;
if (length > syntax().namelen())
message(ParserMessages::hexNumberLength, NumberMessageArg(syntax().namelen()));
in->endToken(length);
}
// Check the character behind an unrecognized token.  Returns 1 after
// reporting nonSgmlCharacter when it is not an SGML character, and 0
// (reporting nothing) when it is one.
Boolean Parser::reportNonSgmlCharacter()
{
  // In scanSuppress mode the non-SGML character will have been read;
  // otherwise the token is empty and the character is fetched now.
  Char suspect;
  if (currentInput()->currentTokenLength())
    suspect = currentChar();
  else
    suspect = getChar();
  if (syntax().isSgmlChar(suspect))
    return 0;
  message(ParserMessages::nonSgmlCharacter, NumberMessageArg(suspect));
  return 1;
}
void Parser::extendS()
{
InputSource *in = currentInput();
size_t length = in->currentTokenLength();
while (syntax().isS(in->tokenChar(messenger())))
length++;
in->endToken(length);
}
#ifdef SP_NAMESPACE
}
#endif