diff --git a/ChangeLog b/ChangeLog index 9d589b74..9f8a3ab4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +Thu Jan 2 13:57:07 CET 2003 Daniel Veillard + + * libxml.spec.in python/Makefile.am python/drv_libxml2.py: + integrated drv_libxml2.py Python xml.sax driver from Stéphane Bidoul + based on the python XmlTextReader interface. + Wed Jan 1 22:05:40 CET 2003 Daniel Veillard * tree.c: backing out one change in the last patch which broke the diff --git a/libxml.spec.in b/libxml.spec.in index c8e051be..c4f12288 100644 --- a/libxml.spec.in +++ b/libxml.spec.in @@ -129,6 +129,7 @@ rm -fr %{buildroot} %doc AUTHORS ChangeLog NEWS README Copyright %{_libdir}/python*/site-packages/libxml2.py +%{_libdir}/python*/site-packages/drv_libxml2.py %{_libdir}/python*/site-packages/libxml2mod* %doc python/TODO %doc python/libxml2class.txt @@ -140,6 +141,10 @@ rm -fr %{buildroot} * @RELDATE@ Daniel Veillard - upstream release @VERSION@ see http://xmlsoft.org/news.html +* Thu Jan 2 2003 Daniel Veillard +- integrated drv_libxml2 xml.sax driver from Stéphane Bidoul +- provides the new XmlTextReader interfaces based on C# XML APIs + * Wed Oct 23 2002 Daniel Veillard - revamped the spec file, cleaned up some rpm building problems diff --git a/python/Makefile.am b/python/Makefile.am index 5b4769ae..0fb1e316 100644 --- a/python/Makefile.am +++ b/python/Makefile.am @@ -19,6 +19,7 @@ EXTRA_DIST = \ generator.py \ libxml_wrap.h \ libxml.py \ + drv_libxml.py \ libxml2-python-api.xml \ $(DOCS) @@ -42,6 +43,7 @@ libxml2.py: $(srcdir)/libxml.py $(srcdir)/libxml2class.py install-data-local: $(mkinstalldirs) $(DESTDIR)$(libdir)/python${PYTHON_VERSION}/site-packages @INSTALL@ -m 0644 libxml2.py $(DESTDIR)$(libdir)/python${PYTHON_VERSION}/site-packages + @INSTALL@ -m 0644 drv_libxml2.py $(DESTDIR)$(libdir)/python${PYTHON_VERSION}/site-packages $(mkinstalldirs) $(DESTDIR)$(DOCS_DIR) @(for doc in $(DOCS) ; \ do @INSTALL@ -m 0644 $$doc $(DESTDIR)$(DOCS_DIR) ; done) diff --git a/python/drv_libxml2.py b/python/drv_libxml2.py new file mode 100644 index 00000000..514aa899 --- /dev/null +++ b/python/drv_libxml2.py @@ -0,0 +1,349 @@ +""" A SAX2 driver for libxml2, on top of it's XmlReader API + +USAGE + # put this file (drv_libxml2.py) in PYTHONPATH + import xml.sax + reader = xml.sax.make_parser(["drv_libxml2"]) + # ...and the rest is standard python sax. + +CAVEATS + - Lexical handlers are supported, except for start/endEntity + (waiting for XmlReader.ResolveEntity) and start/endDTD + - as understand it, libxml2 error handlers are globals (per thread); + each call to parse() registers a new error handler, + overwriting any previously registered handler + --> you can't have 2 LibXml2Reader active at the same time + +TODO + - search for TODO + - some ErrorHandler events (warning) + - some ContentHandler events (setDocumentLocator, skippedEntity) + - EntityResolver (using libxml2.?) + - DTDHandler (if/when libxml2 exposes such node types) + - DeclHandler (if/when libxml2 exposes such node types) + - property_xml_string? + - feature_string_interning? + - Incremental parser + - additional performance tuning: + - one might cache callbacks to avoid some name lookups + - one might implement a smarter way to pass attributes to startElement + (some kind of lazy evaluation?) + - there might be room for improvement in start/endPrefixMapping + - other? + +""" + +__author__ = u"Stéphane Bidoul " +__version__ = "0.1" + +import codecs +from types import StringTypes + +from xml.sax._exceptions import * +from xml.sax import xmlreader, saxutils +from xml.sax.handler import \ + feature_namespaces, \ + feature_namespace_prefixes, \ + feature_string_interning, \ + feature_validation, \ + feature_external_ges, \ + feature_external_pes, \ + property_lexical_handler, \ + property_declaration_handler, \ + property_dom_node, \ + property_xml_string + +# libxml2 returns strings as UTF8 +_decoder = codecs.getdecoder("utf8") +def _d(s): + if s is None: + return s + else: + return _decoder(s)[0] + +try: + import libxml2 +except ImportError, e: + raise SAXReaderNotAvailable("libxml2 not available: " + e) + +try: + import libxslt +except ImportError: + # normal behaviour + def _registerErrorHandler(handler): + libxml2.registerErrorHandler(handler,"drv_libxml") +else: + # work around libxslt bindings bug (libxml2 bug #102181) + def _registerErrorHandler(handler): + libxml2.registerErrorHandler(handler,"drv_libxml") + libxslt.registerErrorHandler(handler,"drv_libxml") + +class LibXml2Reader(xmlreader.XMLReader): + + def __init__(self): + xmlreader.XMLReader.__init__(self) + # features + self.__ns = 0 + self.__nspfx = 0 + self.__validate = 0 + # parsing flag + self.__parsing = 0 + # additional handlers + self.__lex_handler = None + self.__decl_handler = None + # error messages accumulator + self.__errors = None + + def _errorHandler(self,ctx,str): + if self.__errors is None: + self.__errors = [] + self.__errors.append(str) + + def _reportError(self,callback): + # TODO: use SAXParseException, but we need a Locator for that + # TODO: distinguish warnings from errors + msg = "".join(self.__errors) + self.__errors = None + callback(SAXException(msg)) + + def parse(self, source): + self.__parsing = 1 + _registerErrorHandler(self._errorHandler) + try: + # prepare source and create reader + if type(source) in StringTypes: + reader = libxml2.newTextReaderFilename(source) + else: + source = saxutils.prepare_input_source(source) + input = libxml2.inputBuffer(source.getByteStream()) + reader = input.newTextReader(source.getSystemId()) + # configure reader + reader.SetParserProp(libxml2.PARSER_LOADDTD,1) + reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1) + reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1) + reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate) + # we reuse attribute maps (for a slight performance gain) + if self.__ns: + attributesNSImpl = xmlreader.AttributesNSImpl({},{}) + else: + attributesImpl = xmlreader.AttributesImpl({}) + # prefixes to pop (for endPrefixMapping) + prefixes = [] + # start loop + self._cont_handler.startDocument() + while 1: + r = reader.Read() + # check for errors + if r == 1: + if not self.__errors is None: + # non-fatal error + self._reportError(self._err_handler.error) + elif r == 0: + if not self.__errors is None: + # non-fatal error + self._reportError(self._err_handler.error) + break + else: + # fatal error + if not self.__errors is None: + self._reportError(self._err_handler.fatalError) + else: + self._err_handler.fatalError(\ + SAXException("Read failed (no details available)")) + break + # get node type + nodeType = reader.NodeType() + # Element + if nodeType == 1: + if self.__ns: + eltName = (_d(reader.NamespaceUri()),\ + _d(reader.LocalName())) + eltQName = _d(reader.Name()) + attributesNSImpl._attrs = attrs = {} + attributesNSImpl._qnames = qnames = {} + newPrefixes = [] + while reader.MoveToNextAttribute(): + qname = _d(reader.Name()) + value = _d(reader.Value()) + if qname.startswith("xmlns"): + if len(qname) > 5: + newPrefix = qname[6:] + else: + newPrefix = None + newPrefixes.append(newPrefix) + self._cont_handler.startPrefixMapping(\ + newPrefix,value) + if not self.__nspfx: + continue # don't report xmlns attribute + attName = (_d(reader.NamespaceUri()), + _d(reader.LocalName())) + qnames[attName] = qname + attrs[attName] = value + self._cont_handler.startElementNS( \ + eltName,eltQName,attributesNSImpl) + if reader.IsEmptyElement(): + self._cont_handler.endElementNS(eltName,eltQName) + for newPrefix in newPrefixes: + self._cont_handler.endPrefixMapping(newPrefix) + else: + prefixes.append(newPrefixes) + else: + eltName = _d(reader.Name()) + attributesImpl._attrs = attrs = {} + while reader.MoveToNextAttribute(): + attName = _d(reader.Name()) + attrs[attName] = _d(reader.Value()) + self._cont_handler.startElement( \ + eltName,attributesImpl) + if reader.IsEmptyElement(): + self._cont_handler.endElement(eltName) + # EndElement + elif nodeType == 15: + if self.__ns: + self._cont_handler.endElementNS( \ + (_d(reader.NamespaceUri()),_d(reader.LocalName())), + _d(reader.Name())) + for prefix in prefixes.pop(): + self._cont_handler.endPrefixMapping(prefix) + else: + self._cont_handler.endElement(_d(reader.Name())) + # Text + elif nodeType == 3: + self._cont_handler.characters(_d(reader.Value())) + # Whitespace + elif nodeType == 13: + self._cont_handler.ignorableWhitespace(_d(reader.Value())) + # SignificantWhitespace + elif nodeType == 14: + self._cont_handler.characters(_d(reader.Value())) + # CDATA + elif nodeType == 4: + if not self.__lex_handler is None: + self.__lex_handler.startCDATA() + self._cont_handler.characters(_d(reader.Value())) + if not self.__lex_handler is None: + self.__lex_handler.endCDATA() + # EntityReference + elif nodeType == 5: + if not self.__lex_handler is None: + self.startEntity(_d(reader.Name())) + reader.ResolveEntity() + # EndEntity + elif nodeType == 16: + if not self.__lex_handler is None: + self.endEntity(_d(reader.Name())) + # ProcessingInstruction + elif nodeType == 7: + self._cont_handler.processingInstruction( \ + _d(reader.Name()),_d(reader.Value())) + # Comment + elif nodeType == 8: + if not self.__lex_handler is None: + self.__lex_handler.comment(_d(reader.Value())) + # DocumentType + elif nodeType == 10: + #if not self.__lex_handler is None: + # self.__lex_handler.startDTD() + pass # TODO (how to detect endDTD? on first non-dtd event?) + # XmlDeclaration + elif nodeType == 17: + pass # TODO + # Entity + elif nodeType == 6: + pass # TODO (entity decl) + # Notation (decl) + elif nodeType == 12: + pass # TODO + # Attribute (never in this loop) + #elif nodeType == 2: + # pass + # Document (not exposed) + #elif nodeType == 9: + # pass + # DocumentFragment (never returned by XmlReader) + #elif nodeType == 11: + # pass + # None + #elif nodeType == 0: + # pass + # - + else: + raise SAXException("Unexpected node type %d" % nodeType) + if r == 0: + self._cont_handler.endDocument() + reader.Close() + finally: + self.__parsing = 0 + # TODO: unregister error handler? + + def setDTDHandler(self, handler): + # TODO (when supported, the inherited method works just fine) + raise SAXNotSupportedException("DTDHandler not supported") + + def setEntityResolver(self, resolver): + # TODO (when supported, the inherited method works just fine) + raise SAXNotSupportedException("EntityResolver not supported") + + def getFeature(self, name): + if name == feature_namespaces: + return self.__ns + elif name == feature_namespace_prefixes: + return self.__nspfx + elif name == feature_validation: + return self.__validate + elif name == feature_external_ges: + return 1 # TODO (does that relate to PARSER_LOADDTD)? + elif name == feature_external_pes: + return 1 # TODO (does that relate to PARSER_LOADDTD)? + else: + raise SAXNotRecognizedException("Feature '%s' not recognized" % \ + name) + + def setFeature(self, name, state): + if self.__parsing: + raise SAXNotSupportedException("Cannot set feature %s " \ + "while parsing" % name) + if name == feature_namespaces: + self.__ns = state + elif name == feature_namespace_prefixes: + self.__nspfx = state + elif name == feature_validation: + self.__validate = state + elif name == feature_external_ges: + if state == 0: + # TODO (does that relate to PARSER_LOADDTD)? + raise SAXNotSupportedException("Feature '%s' not supported" % \ + name) + elif name == feature_external_pes: + if state == 0: + # TODO (does that relate to PARSER_LOADDTD)? + raise SAXNotSupportedException("Feature '%s' not supported" % \ + name) + else: + raise SAXNotRecognizedException("Feature '%s' not recognized" % \ + name) + + def getProperty(self, name): + if name == property_lexical_handler: + return self.__lex_handler + elif name == property_declaration_handler: + return self.__decl_handler + else: + raise SAXNotRecognizedException("Property '%s' not recognized" % \ + name) + + def setProperty(self, name, value): + if name == property_lexical_handler: + self.__lex_handler = value + elif name == property_declaration_handler: + # TODO: remove if/when libxml2 supports dtd events + raise SAXNotSupportedException("Property '%s' not supported" % \ + name) + self.__decl_handler = value + else: + raise SAXNotRecognizedException("Property '%s' not recognized" % \ + name) + +def create_parser(): + return LibXml2Reader() +