# -*- coding: iso-8859-1 -*-
""" A SAX2 driver for libxml2, on top of its XmlReader API
USAGE
# put this file (drv_libxml2.py) in PYTHONPATH
import xml.sax
reader = xml.sax.make_parser(["drv_libxml2"])
# ...and the rest is standard python sax.
CAVEATS
- Error callbacks are not exactly synchronous: they tend
to be invoked before the corresponding content callback,
because the underlying reader interface parses
data in chunks of 512 bytes
TODO
- search for TODO
- some ErrorHandler events (warning)
- some ContentHandler events (setDocumentLocator, skippedEntity)
- EntityResolver (using libxml2.?)
- property_xml_string?
- feature_string_interning?
- Incremental parser
- additional performance tuning:
- one might cache callbacks to avoid some name lookups
- one might implement a smarter way to pass attributes to startElement
(some kind of lazy evaluation?)
- there might be room for improvement in start/endPrefixMapping
- other?
"""
import codecs
# libxml2 returns strings as UTF8
def _d(s):
if s is None:
return s
else:
return _decoder(s)[0]
# Import the libxml2 binding; if it is missing, report that through
# SAXReaderNotAvailable instead of a bare ImportError.
try:
    import libxml2
except ImportError as e:
    # "except ... as e" is accepted by Python 2.6+ and Python 3; the older
    # "except ImportError, e" form is a syntax error on Python 3.
    raise SAXReaderNotAvailable("libxml2 not available: "
                                "import error was: %s" % e)
"""SAX Locator adapter for libxml2.xmlTextReaderLocator"""
"Return the column number where the current event ends."
return -1
"Return the line number where the current event ends."
"Return the public identifier for the current event."
return None
"Return the system identifier for the current event."
# features
# parsing flag
# additional handlers
self.__lex_handler = None
self.__decl_handler = None
# error messages accumulator
SAXParseException(msg,None,
else:
# when fatal is set, the parse will stop;
# we consider that the last error reported
# is the fatal one.
else:
try:
# prepare source and create reader
else:
# configure reader
if self.__extparams:
else:
# we reuse attribute maps (for a slight performance gain)
else:
# prefixes to pop (for endPrefixMapping)
prefixes = []
# start loop
while 1:
# check for errors
if r == 1:
elif r == 0:
break # end of parse
else:
else:
SAXException("Read failed (no details available)"))
break # fatal parse error
# get node type
# Element
if nodeType == 1:
newPrefixes = []
while reader.MoveToNextAttribute():
else:
newPrefix = None
continue # don't report xmlns attribute
if reader.IsEmptyElement():
for newPrefix in newPrefixes:
else:
else:
while reader.MoveToNextAttribute():
if reader.IsEmptyElement():
# EndElement
elif nodeType == 15:
else:
# Text
elif nodeType == 3:
# Whitespace
elif nodeType == 13:
# SignificantWhitespace
elif nodeType == 14:
# CDATA
elif nodeType == 4:
if not self.__lex_handler is None:
if not self.__lex_handler is None:
# EntityReference
elif nodeType == 5:
if not self.__lex_handler is None:
# EndEntity
elif nodeType == 16:
if not self.__lex_handler is None:
# ProcessingInstruction
elif nodeType == 7:
# Comment
elif nodeType == 8:
if not self.__lex_handler is None:
# DocumentType
elif nodeType == 10:
#if not self.__lex_handler is None:
# self.__lex_handler.startDTD()
pass # TODO (how to detect endDTD? on first non-dtd event?)
# XmlDeclaration
elif nodeType == 17:
pass # TODO
# Entity
elif nodeType == 6:
pass # TODO (entity decl)
# Notation (decl)
elif nodeType == 12:
pass # TODO
# Attribute (never in this loop)
#elif nodeType == 2:
# pass
# Document (not exposed)
#elif nodeType == 9:
# pass
# DocumentFragment (never returned by XmlReader)
#elif nodeType == 11:
# pass
# None
#elif nodeType == 0:
# pass
# -
else:
if r == 0:
finally:
# TODO (when supported, the inherited method works just fine)
raise SAXNotSupportedException("DTDHandler not supported")
# TODO (when supported, the inherited method works just fine)
raise SAXNotSupportedException("EntityResolver not supported")
if name == feature_namespaces:
elif name == feature_namespace_prefixes:
elif name == feature_validation:
return self.__validate
elif name == feature_external_ges:
return 1 # TODO (does that relate to PARSER_LOADDTD)?
elif name == feature_external_pes:
return self.__extparams
else:
raise SAXNotRecognizedException("Feature '%s' not recognized" % \
name)
raise SAXNotSupportedException("Cannot set feature %s " \
"while parsing" % name)
if name == feature_namespaces:
elif name == feature_namespace_prefixes:
elif name == feature_validation:
elif name == feature_external_ges:
if state == 0:
# TODO (does that relate to PARSER_LOADDTD)?
raise SAXNotSupportedException("Feature '%s' not supported" % \
name)
elif name == feature_external_pes:
else:
raise SAXNotRecognizedException("Feature '%s' not recognized" % \
name)
if name == property_lexical_handler:
return self.__lex_handler
elif name == property_declaration_handler:
return self.__decl_handler
else:
raise SAXNotRecognizedException("Property '%s' not recognized" % \
name)
if name == property_lexical_handler:
elif name == property_declaration_handler:
raise SAXNotSupportedException("Property '%s' not supported" % \
name)
else:
raise SAXNotRecognizedException("Property '%s' not recognized" % \
name)
def create_parser():
    """Construct and return a new ``LibXml2Reader`` instance.

    Takes no arguments; every call builds a fresh reader object.
    """
    reader = LibXml2Reader()
    return reader