From e59494fa5410ced5625687f025f61d30c5b2f933 Mon Sep 17 00:00:00 2001
From: Daniel Veillard This document describes the use of the XmlTextReader streaming API added
-to libxml2 in version 2.5.0 . This API is closely modelled on the XmlTextReader
and XmlReader
@@ -38,6 +38,8 @@ examples using both C and the Python bindings: The only things worth adding are that the xmlTextReader
is abstracted as a class like in C# with the same method names (but the
-properties are currently accessed with methods) and to note one doesn't need
-to free the reader at the end of the processing, it will get garbage
-collected once all references have disapeared
Let's look first at a small example to get this in practice by redefining +the processNode() function in the Python example:
+def processNode(reader): + print "%d %d %s %d" % (reader.Depth(), reader.NodeType(), + reader.Name(), reader.IsEmptyElement())+ +
and look at the result of calling streamFile("tst.xml") for various +content of the XML test file.
+ +For the minimal document "<doc/>
" we get:
0 1 doc 1+ +
Only one node is found, its depth is 0, type 1 indicates an element start,
+of name "doc" and it is empty. Trying now with
+"<doc></doc>
" instead leads to:
0 1 doc 0 +0 15 doc 0+ +
The document root node is not flagged as empty anymore and both a start +and an end of element are detected. The following document shows how +character data are reported:
+<doc><a/><b>some text</b> +<c/></doc>+ +
We now modify the processNode() function to also report the node Value:
+def processNode(reader): + print "%d %d %s %d %s" % (reader.Depth(), reader.NodeType(), + reader.Name(), reader.IsEmptyElement(), + reader.Value())+ +
The result of the test is:
+0 1 doc 0 None +1 1 a 1 None +1 1 b 0 None +2 3 #text 0 some text +1 15 b 0 None +1 3 #text 0 + +1 1 c 1 None +0 15 doc 0 None+ +
There are a few things to note:
+The equivalent routine for processNode()
as used by
+xmllint --stream --debug
is the following and can be found in
+the xmllint.c module in the source distribution:
static void processNode(xmlTextReaderPtr reader) { + xmlChar *name, *value; + + name = xmlTextReaderName(reader); + if (name == NULL) + name = xmlStrdup(BAD_CAST "--"); + value = xmlTextReaderValue(reader); + + printf("%d %d %s %d", + xmlTextReaderDepth(reader), + xmlTextReaderNodeType(reader), + name, + xmlTextReaderIsEmptyElement(reader)); + xmlFree(name); + if (value == NULL) + printf("\n"); + else { + printf(" %s\n", value); + xmlFree(value); + } +}+ +
The previous examples don't indicate how attributes are processed. The
+simple test "<doc a="b"/>
" provides the following
+result:
0 1 doc 1 None+ +
This proves that attribute nodes are not traversed by default. The +HasAttributes property allows detecting their presence. To check +their content the API has special instructions; basically 2 kinds of operations +are possible:
+In both cases the attribute can be designated either by its position in the +list of attributes (MoveToAttributeNo or GetAttributeNo) or +by its name (and namespace):
+After modifying the processNode() function to show attributes:
+def processNode(reader): + print "%d %d %s %d %s" % (reader.Depth(), reader.NodeType(), + reader.Name(), reader.IsEmptyElement(), + reader.Value()) + if reader.NodeType() == 1: # Element + while reader.MoveToNextAttribute(): + print "-- %d %d (%s) [%s]" % (reader.Depth(), reader.NodeType(), + reader.Name(),reader.Value())+ +
the output for the same input document reflects the attribute:
+0 1 doc 1 None +-- 1 2 (a) [b]+ +
There are a couple of things to note on the attribute processing:
+The libxml2 implementation adds some extra features on top of the XmlTextReader +API; the main one is the ability to DTD validate the parsed document +progressively. This is simply the activation of the associated feature of the +parser used by the reader structure. There are a few options available, +defined as the enum xmlParserProperties in the libxml/xmlreader.h header +file:
+The GetParserProp() and SetParserProp() methods can then be used to get +and set the values of those parser properties of the reader. For example
+def parseAndValidate(file): + reader = libxml2.newTextReaderFilename(file) + reader.SetParserProp(libxml2.PARSER_VALIDATE, 1) + ret = reader.Read() + while ret == 1: + ret = reader.Read() + if ret != 0: + print "Error parsing and validating %s" % (file)+ +
This routine will parse and validate the file. Error messages can be +captured by registering an error handler. See python/tests/reader2.py for +more complete Python examples. At the C level the equivalent call to activate +the validation feature is just:
+ret = xmlTextReaderSetParserProp(reader, XML_PARSER_VALIDATE, 1)+ +
and a return value of 0 indicates success.
+diff --git a/python/libxml.py b/python/libxml.py index a1b5a7bb..6203e3d5 100644 --- a/python/libxml.py +++ b/python/libxml.py @@ -302,11 +302,6 @@ class xmlCore: return libxml2mod.name(self._o) def get_type(self): return libxml2mod.type(self._o) - def get_doc(self): - ret = libxml2mod.doc(self._o) - if ret == None: - return None - return xmlDoc(_obj=ret) def free(self): libxml2mod.freeDoc(self._o) @@ -356,7 +351,7 @@ def nodeWrap(o): if name == "entity_decl": return xmlEntity(_obj=o) if name == "dtd": - return xmlAttr(_obj=o) + return xmlDtd(_obj=o) return xmlNode(_obj=o) def xpathObjectRet(o): diff --git a/xmllint.c b/xmllint.c index 41dac2fd..39165fd7 100644 --- a/xmllint.c +++ b/xmllint.c @@ -572,26 +572,24 @@ static int count = 0; static int elem, attrs; static void processNode(xmlTextReaderPtr reader) { - if (debug) { - xmlChar *name, *value; + xmlChar *name, *value; - name = xmlTextReaderName(reader); - if (name == NULL) - name = xmlStrdup(BAD_CAST "--"); - value = xmlTextReaderValue(reader); + name = xmlTextReaderName(reader); + if (name == NULL) + name = xmlStrdup(BAD_CAST "--"); + value = xmlTextReaderValue(reader); - printf("%d %d %d %s", - xmlTextReaderDepth(reader), - xmlTextReaderNodeType(reader), - xmlTextReaderIsEmptyElement(reader), - name); - xmlFree(name); - if (value == NULL) - printf("\n"); - else { - printf(" %s\n", value); - xmlFree(value); - } + printf("%d %d %s %d", + xmlTextReaderDepth(reader), + xmlTextReaderNodeType(reader), + name, + xmlTextReaderIsEmptyElement(reader)); + xmlFree(name); + if (value == NULL) + printf("\n"); + else { + printf(" %s\n", value); + xmlFree(value); } } diff --git a/xmlreader.c b/xmlreader.c index 159447a9..0577334b 100644 --- a/xmlreader.c +++ b/xmlreader.c @@ -602,6 +602,7 @@ xmlTextReaderRead(xmlTextReaderPtr reader) { reader->state = XML_TEXTREADER_ELEMENT; } else { reader->node = reader->ctxt->nodeTab[0]; + reader->state = XML_TEXTREADER_ELEMENT; } reader->depth = 0; goto node_found;