a bit of cleanup small fix more work on the XmlTextReader tutorial a few

* xmllint.c: a bit of cleanup * xmlreader.c: small fix * doc/xmlreader.html: more work on the XmlTextReader tutorial * python/libxml.py: a few fixes pointed out by Hannu Krosing Daniel
2025-07-29 11:41:22 +03:00 · 2003-01-04 16:35:29 +00:00
parent 623a9eb2df
commit e59494fa54
5 changed files with 218 additions and 39 deletions
--- a/7
+++ b/7
@ -1,3 +1,10 @@
+Sat Jan  4 17:33:17 CET 2003 Daniel Veillard <daniel@veillard.com>
+
+	* xmllint.c: a bit of cleanup
+	* xmlreader.c: small fix
+	* doc/xmlreader.html: more work on the XmlTextReader tutorial
+	* python/libxml.py: a few fixes pointed out by Hannu Krosing
+
 Sat Jan  4 13:46:14 CET 2003 Daniel Veillard <daniel@veillard.com>

 	* python/setup.py.in: patch from Stéphane Bidoul to include
--- a/doc/xmlreader.html
+++ b/doc/xmlreader.html
@ -23,7 +23,7 @@ A:link, A:visited, A:active { text-decoration: underline }-->
 <p></p>

 <p>This document describes the use of the XmlTextReader streaming API added
-to libxml2 in version 2.5.0 . This API is closely modelled on the <a
+to libxml2 in version 2.5.0 . This API is closely modeled after the <a
 href="http://dotgnu.org/pnetlib-doc/System/Xml/XmlTextReader.html">XmlTextReader</a>
 and <a
 href="http://dotgnu.org/pnetlib-doc/System/Xml/XmlReader.html">XmlReader</a>
@ -38,6 +38,8 @@ examples using both C and the Python bindings:</p>
  <li><a href="#Walking">Walking a simple tree</a></li>
  <li><a href="#Extracting">Extracting informations for the current
  node</a></li>
+  <li><a href="#Extracting1">Extracting informations for the
+  attributes</a></li>
  <li><a href="#Validating">Validating a document</a></li>
  <li><a href="#Entities">Entities substitution</a></li>
 </ul>
@ -132,26 +134,28 @@ int streamFile(char *filename) {
 def processNode(reader):
    pass

-try:
-    reader = newTextReaderFilename(filename)
-except:
+def streamFile(filename):
+    try:
+        reader = libxml2.newTextReaderFilename(filename)
+    except:
        print "unable to open %s" % (filename)
+        return

-
-ret = reader.Read()
-while ret == 1:
+    ret = reader.Read()
+    while ret == 1:
        processNode(reader)
        ret = reader.Read()
-if ret != 0:
+
+    if ret != 0:
        print "%s : failed to parse" % (filename)
 </pre>

 <p>The only things worth adding are that the <a
 href="http://dotgnu.org/pnetlib-doc/System/Xml/XmlTextReader.html">xmlTextReader
 is abstracted as a class like in C#</a> with the same method names (but the
-properties are currently accessed with methods) and to note one doesn't need
-to free the reader at the end of the processing, it will get garbage
-collected once all references have disapeared</p>
+properties are currently accessed with methods) and that one doesn't need to
+free the reader at the end of the processing, it will get garbage collected
+once all references have disappeared</p>

 <h2><a name="Extracting">Extracting informations for the current node</a></h2>

@ -206,10 +210,184 @@ XmlTextReader class</a> set of properties and methods:</p>
    current node.</li>
 </ul>

-<p></p>
+<p>Let's look first at a small example to get this in practice by redefining
+the processNode() function in the Python example:</p>
+<pre>def processNode(reader):
+    print "%d %d %s %d" % (reader.Depth(), reader.NodeType(),
+                           reader.Name(), reader.IsEmptyElement())</pre>
+
+<p>and look at the result of calling streamFile("tst.xml") for various
+content of the XML test file.</p>
+
+<p>For the minimal document "<code>&lt;doc/&gt;</code>" we get:</p>
+<pre>0 1 doc 1</pre>
+
+<p>Only one node is found, its depth is 0, type 1 indicates an element start,
+of name "doc" and it is empty. Trying now with
+"<code>&lt;doc&gt;&lt;/doc&gt;</code>" instead leads to:</p>
+<pre>0 1 doc 0
+0 15 doc 0</pre>
+
+<p>The document root node is not flagged as empty anymore and both a start
+and an end of element are detected. The following document shows how
+character data are reported:</p>
+<pre>&lt;doc&gt;&lt;a/&gt;&lt;b&gt;some text&lt;/b&gt;
+&lt;c/&gt;&lt;/doc&gt;</pre>
+
+<p>We modify the processNode() function to also report the node Value:</p>
+<pre>def processNode(reader):
+    print "%d %d %s %d %s" % (reader.Depth(), reader.NodeType(),
+                              reader.Name(), reader.IsEmptyElement(),
+                              reader.Value())</pre>
+
+<p>The result of the test is:</p>
+<pre>0 1 doc 0 None
+1 1 a 1 None
+1 1 b 0 None
+2 3 #text 0 some text
+1 15 b 0 None
+1 3 #text 0
+
+1 1 c 1 None
+0 15 doc 0 None</pre>
+
+<p>There are a few things to note:</p>
+<ul>
+  <li>the increase of the depth value (first row) as children nodes are
+    explored</li>
+  <li>the text node child of the b element, of type 3 and its content</li>
+  <li>the text node containing the line return between elements b and c</li>
+  <li>that elements have the Value None (or NULL in C)</li>
+</ul>
+
+<p>The equivalent routine for <code>processNode()</code> as used by
+<code>xmllint --stream --debug</code> is the following and can be found in
+the xmllint.c module in the source distribution:</p>
+<pre>static void processNode(xmlTextReaderPtr reader) {
+    xmlChar *name, *value;
+
+    name = xmlTextReaderName(reader);
+    if (name == NULL)
+        name = xmlStrdup(BAD_CAST "--");
+    value = xmlTextReaderValue(reader);
+
+    printf("%d %d %s %d",
+            xmlTextReaderDepth(reader),
+            xmlTextReaderNodeType(reader),
+            name,
+            xmlTextReaderIsEmptyElement(reader));
+    xmlFree(name);
+    if (value == NULL)
+        printf("\n");
+    else {
+        printf(" %s\n", value);
+        xmlFree(value);
+    }
+}</pre>
+
+<h2><a name="Extracting1">Extracting informations for the attributes</a></h2>
+
+<p>The previous examples don't indicate how attributes are processed. The
+simple test "<code>&lt;doc a="b"/&gt;</code>" provides the following
+result:</p>
+<pre>0 1 doc 1 None</pre>
+
+<p>This proves that attribute nodes are not traversed by default. The
+<em>HasAttributes</em> property allows detecting their presence. To check
+their content the API has special instructions; basically 2 kinds of
+operations are possible:</p>
+<ol>
+  <li>to move the reader to the attribute nodes of the current element, in
+    that case the cursor is positioned on the attribute node</li>
+  <li>to directly query the element node for the attribute value</li>
+</ol>
+
+<p>In both cases the attribute can be designated either by its position in
+the list of attributes (<em>MoveToAttributeNo</em> or
+<em>GetAttributeNo</em>) or by its name (and namespace):</p>
+<ul>
+  <li><em>GetAttributeNo</em>(no): provides the value of the attribute with
+    the specified index no relative to the containing element.</li>
+  <li><em>GetAttribute</em>(name): provides the value of the attribute with
+    the specified qualified name.</li>
+  <li><em>GetAttributeNs</em>(localName, namespaceURI): provides the value of the
+    attribute with the specified local name and namespace URI.</li>
+  <li><em>MoveToAttributeNo</em>(no): moves the position of the current
+    instance to the attribute with the specified index relative to the
+    containing element.</li>
+  <li><em>MoveToAttribute</em>(name): moves the position of the current
+    instance to the attribute with the specified qualified name.</li>
+  <li><em>MoveToAttributeNs</em>(localName, namespaceURI): moves the position
+    of the current instance to the attribute with the specified local name
+    and namespace URI.</li>
+  <li><em>MoveToFirstAttribute</em>: moves the position of the current
+    instance to the first attribute associated with the current node.</li>
+  <li><em>MoveToNextAttribute</em>: moves the position of the current
+    instance to the next attribute associated with the current node.</li>
+  <li><em>MoveToElement</em>: moves the position of the current instance to
+    the node that contains the current Attribute  node.</li>
+</ul>
+
+<p>After modifying the processNode() function to show attributes:</p>
+<pre>def processNode(reader):
+    print "%d %d %s %d %s" % (reader.Depth(), reader.NodeType(),
+                              reader.Name(), reader.IsEmptyElement(),
+                              reader.Value())
+    if reader.NodeType() == 1: # Element
+        while reader.MoveToNextAttribute():
+            print "-- %d %d (%s) [%s]" % (reader.Depth(), reader.NodeType(),
+                                          reader.Name(),reader.Value())</pre>
+
+<p>the output for the same input document reflects the attribute:</p>
+<pre>0 1 doc 1 None
+-- 1 2 (a) [b]</pre>
+
+<p>There are a couple of things to note on the attribute processing:</p>
+<ul>
+  <li>their depth is the one of the carrying element plus one</li>
+  <li>namespace declarations are seen as attributes like in DOM</li>
+</ul>

 <h2><a name="Validating">Validating a document</a></h2>

+<p>The libxml2 implementation adds some extra features on top of the XmlTextReader
+API; the main one is the ability to DTD validate the parsed document
+progressively. This is simply the activation of the associated feature of the
+parser used by the reader structure. There are a few options available
+defined as the enum xmlParserProperties in the libxml/xmlreader.h header
+file:</p>
+<ul>
+  <li>XML_PARSER_LOADDTD: force loading the DTD (without validating)</li>
+  <li>XML_PARSER_DEFAULTATTRS: force attribute defaulting (this also implies
+    loading the DTD)</li>
+  <li>XML_PARSER_VALIDATE: activate DTD validation (this also implies loading
+    the DTD)</li>
+  <li>XML_PARSER_SUBST_ENTITIES: substitute entities on the fly, entity
+    reference nodes are not generated and are replaced by their expanded
+    content.</li>
+  <li>more settings might be added, those were the ones available at the 2.5.0
+    release...</li>
+</ul>
+
+<p>The GetParserProp() and SetParserProp() methods can then be used to get
+and set the values of those parser properties of the reader. For example</p>
+<pre>def parseAndValidate(file):
+    reader = libxml2.newTextReaderFilename(file)
+    reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
+    ret = reader.Read()
+    while ret == 1:
+        ret = reader.Read()
+    if ret != 0:
+        print "Error parsing and validating %s" % (file)</pre>
+
+<p>This routine will parse and validate the file. Error messages can be
+captured by registering an error handler. See python/tests/reader2.py for
+more complete Python examples. At the C level the equivalent call to activate
+the validation feature is just:</p>
+<pre>ret = xmlTextReaderSetParserProp(reader, XML_PARSER_VALIDATE, 1)</pre>
+
+<p>and a return value of 0 indicates success.</p>
+
 <h2><a name="Entities">Entities substitution</a></h2>

 <p> </p>
--- a/python/libxml.py
+++ b/python/libxml.py
@ -302,11 +302,6 @@ class xmlCore:
        return libxml2mod.name(self._o)
    def get_type(self):
        return libxml2mod.type(self._o)
-    def get_doc(self):
-        ret = libxml2mod.doc(self._o)
-        if ret == None:
-            return None
-        return xmlDoc(_obj=ret)
    def free(self):
        libxml2mod.freeDoc(self._o)

@ -356,7 +351,7 @@ def nodeWrap(o):
    if name == "entity_decl":
        return xmlEntity(_obj=o)
    if name == "dtd":
-        return xmlAttr(_obj=o)
+        return xmlDtd(_obj=o)
    return xmlNode(_obj=o)

 def xpathObjectRet(o):
--- a/xmllint.c
+++ b/xmllint.c
@ -572,7 +572,6 @@ static int count = 0;
 static int elem, attrs;

 static void processNode(xmlTextReaderPtr reader) {
-    if (debug) {
    xmlChar *name, *value;

    name = xmlTextReaderName(reader);
@ -580,11 +579,11 @@ static void processNode(xmlTextReaderPtr reader) {
 	name = xmlStrdup(BAD_CAST "--");
    value = xmlTextReaderValue(reader);

-	printf("%d %d %d %s", 
+    printf("%d %d %s %d", 
 	    xmlTextReaderDepth(reader),
 	    xmlTextReaderNodeType(reader),
-		xmlTextReaderIsEmptyElement(reader),
-		name);
+	    name,
+	    xmlTextReaderIsEmptyElement(reader));
    xmlFree(name);
    if (value == NULL)
 	printf("\n");
@ -592,7 +591,6 @@ static void processNode(xmlTextReaderPtr reader) {
 	printf(" %s\n", value);
 	xmlFree(value);
    }
-    }
 }

 static void streamFile(char *filename) {
--- a/xmlreader.c
+++ b/xmlreader.c
@ -602,6 +602,7 @@ xmlTextReaderRead(xmlTextReaderPtr reader) {
 	    reader->state = XML_TEXTREADER_ELEMENT;
 	} else {
 	    reader->node = reader->ctxt->nodeTab[0];
+	    reader->state = XML_TEXTREADER_ELEMENT;
 	}
 	reader->depth = 0;
 	goto node_found;