From e59494fa5410ced5625687f025f61d30c5b2f933 Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Sat, 4 Jan 2003 16:35:29 +0000 Subject: [PATCH] a bit of cleanup small fix more work on the XmlTextReader tutorial a few * xmllint.c: a bit of cleanup * xmlreader.c: small fix * doc/xmlreader.html: more work on the XmlTextReader tutorial * python/libxml.py: a few fixes pointed out by Hannu Krosing Daniel --- ChangeLog | 7 ++ doc/xmlreader.html | 208 +++++++++++++++++++++++++++++++++++++++++---- python/libxml.py | 7 +- xmllint.c | 34 ++++---- xmlreader.c | 1 + 5 files changed, 218 insertions(+), 39 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9b011702..994c4c80 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +Sat Jan 4 17:33:17 CET 2003 Daniel Veillard + + * xmllint.c: a bit of cleanup + * xmlreader.c: small fix + * doc/xmlreader.html: more work on the XmlTextReader tutorial + * python/libxml.py: a few fixes pointed out by Hannu Krosing + Sat Jan 4 13:46:14 CET 2003 Daniel Veillard * python/setup.py.in: patch from Stéphane Bidoul to include diff --git a/doc/xmlreader.html b/doc/xmlreader.html index d776ec0d..e818a77c 100644 --- a/doc/xmlreader.html +++ b/doc/xmlreader.html @@ -23,7 +23,7 @@ A:link, A:visited, A:active { text-decoration: underline }-->

This document describes the use of the XmlTextReader streaming API added -to libxml2 in version 2.5.0 . This API is closely modelled on the XmlTextReader and XmlReader @@ -38,6 +38,8 @@ examples using both C and the Python bindings:

  • Walking a simple tree
  • Extracting informations for the current node
  • +
  • Extracting informations for the + attributes
  • Validating a document
  • Entities substitution
  • @@ -132,26 +134,28 @@ int streamFile(char *filename) { def processNode(reader): pass -try: - reader = newTextReaderFilename(filename) -except: - print "unable to open %s" % (filename) +def streamFile(filename): + try: + reader = libxml2.newTextReaderFilename(filename) + except: + print "unable to open %s" % (filename) + return - -ret = reader.Read() -while ret == 1: - processNode(reader) ret = reader.Read() -if ret != 0: - print "%s : failed to parse" % (filename) + while ret == 1: + processNode(reader) + ret = reader.Read() + + if ret != 0: + print "%s : failed to parse" % (filename)

    The only things worth adding are that the xmlTextReader is abstracted as a class like in C# with the same method names (but the -properties are currently accessed with methods) and to note one doesn't need -to free the reader at the end of the processing, it will get garbage -collected once all references have disapeared

    +properties are currently accessed with methods) and that one doesn't need to +free the reader at the end of the processing, it will get garbage collected +once all references have disappeared

    Extracting informations for the current node

    @@ -206,10 +210,184 @@ XmlTextReader class set of properties and methods:

    current node. -

    +

    Let's look first at a small example to get this in practice by redefining +the processNode() function in the Python example:

    +
    def processNode(reader):
    +    print "%d %d %s %d" % (reader.Depth(), reader.NodeType(),
    +                           reader.Name(), reader.IsEmptyElement())
    + +

    and look at the result of calling streamFile("tst.xml") for various +content of the XML test file.

    + +

    For the minimal document "<doc/>" we get:

    +
    0 1 doc 1
    + +

    Only one node is found, its depth is 0, type 1 indicates an element start, +of name "doc" and it is empty. Trying now with +"<doc></doc>" instead leads to:

    +
    0 1 doc 0
    +0 15 doc 0
    + +

    The document root node is not flagged as empty anymore and both a start +and an end of element are detected. The following document shows how +character data are reported:

    +
    <doc><a/><b>some text</b>
    +<c/></doc>
    + +

    We modify the processNode() function to also report the node Value:

    +
    def processNode(reader):
    +    print "%d %d %s %d %s" % (reader.Depth(), reader.NodeType(),
    +                              reader.Name(), reader.IsEmptyElement(),
    +                              reader.Value())
    + +

    The result of the test is:

    +
    0 1 doc 0 None
    +1 1 a 1 None
    +1 1 b 0 None
    +2 3 #text 0 some text
    +1 15 b 0 None
    +1 3 #text 0
    +
    +1 1 c 1 None
    +0 15 doc 0 None
    + +

    There are a few things to note:

    +
      +
    • the increase of the depth value (first row) as children nodes are + explored
    • +
    • the text node child of the b element, of type 3 and its content
    • +
    • the text node containing the line return between elements b and c
    • +
    • that elements have the Value None (or NULL in C)
    • +
    + +

    The equivalent routine for processNode() as used by +xmllint --stream --debug is the following and can be found in +the xmllint.c module in the source distribution:

    +
    static void processNode(xmlTextReaderPtr reader) {
    +    xmlChar *name, *value;
    +
    +    name = xmlTextReaderName(reader);
    +    if (name == NULL)
    +        name = xmlStrdup(BAD_CAST "--");
    +    value = xmlTextReaderValue(reader);
    +
    +    printf("%d %d %s %d",
    +            xmlTextReaderDepth(reader),
    +            xmlTextReaderNodeType(reader),
    +            name,
    +            xmlTextReaderIsEmptyElement(reader));
    +    xmlFree(name);
    +    if (value == NULL)
    +        printf("\n");
    +    else {
    +        printf(" %s\n", value);
    +        xmlFree(value);
    +    }
    +}
    + +

    Extracting informations for the attributes

    + +

    The previous examples don't indicate how attributes are processed. The +simple test "<doc a="b"/>" provides the following +result:

    +
    0 1 doc 1 None
    + +

    This proves that attribute nodes are not traversed by default. The +HasAttributes property allows detecting their presence. To check +their content the API has special instructions; basically 2 kinds of operations +are possible:

    +
      +
    1. to move the reader to the attribute nodes of the current element, in + that case the cursor is positioned on the attribute node
    2. +
    3. to directly query the element node for the attribute value
    4. +
    + +

    In both cases the attribute can be designated either by its position in the +list of attributes (MoveToAttributeNo or GetAttributeNo) or +by its name (and namespace):

    +
      +
    • GetAttributeNo(no): provides the value of the attribute with + the specified index no relative to the containing element.
    • +
    • GetAttribute(name): provides the value of the attribute with + the specified qualified name.
    • +
    • GetAttributeNs(localName, namespaceURI): provides the value of the + attribute with the specified local name and namespace URI.
    • +
    • MoveToAttributeNo(no): moves the position of the current + instance to the attribute with the specified index relative to the + containing element.
    • +
    • MoveToAttribute(name): moves the position of the current + instance to the attribute with the specified qualified name.
    • +
    • MoveToAttributeNs(localName, namespaceURI): moves the position + of the current instance to the attribute with the specified local name + and namespace URI.
    • +
    • MoveToFirstAttribute: moves the position of the current + instance to the first attribute associated with the current node.
    • +
    • MoveToNextAttribute: moves the position of the current + instance to the next attribute associated with the current node.
    • +
    • MoveToElement: moves the position of the current instance to + the node that contains the current Attribute node.
    • +
    + +

    After modifying the processNode() function to show attributes:

    +
    def processNode(reader):
    +    print "%d %d %s %d %s" % (reader.Depth(), reader.NodeType(),
    +                              reader.Name(), reader.IsEmptyElement(),
    +                              reader.Value())
    +    if reader.NodeType() == 1: # Element
    +        while reader.MoveToNextAttribute():
    +            print "-- %d %d (%s) [%s]" % (reader.Depth(), reader.NodeType(),
    +                                          reader.Name(),reader.Value())
    + +

    the output for the same input document reflects the attribute:

    +
    0 1 doc 1 None
    +-- 1 2 (a) [b]
    + +

    There are a couple of things to note on the attribute processing:

    +
      +
    • their depth is the one of the carrying element plus one
    • +
    • namespace declarations are seen as attributes like in DOM
    • +

    Validating a document

    +

    Libxml2 implementation adds some extra features on top of the XmlTextReader +API, the main one is the ability to DTD validate the parsed document +progressively. This is simply the activation of the associated feature of the +parser used by the reader structure. There are a few options available +defined as the enum xmlParserProperties in the libxml/xmlreader.h header +file:

    +
      +
    • XML_PARSER_LOADDTD: force loading the DTD (without validating)
    • +
    • XML_PARSER_DEFAULTATTRS: force attribute defaulting (this also implies + loading the DTD)
    • +
    • XML_PARSER_VALIDATE: activate DTD validation (this also implies loading + the DTD)
    • +
    • XML_PARSER_SUBST_ENTITIES: substitute entities on the fly, entity + reference nodes are not generated and are replaced by their expanded + content.
    • +
    • more settings might be added, those were the ones available at the 2.5.0 + release...
    • +
    + +

    The GetParserProp() and SetParserProp() methods can then be used to get +and set the values of those parser properties of the reader. For example

    +
    def parseAndValidate(file):
    +    reader = libxml2.newTextReaderFilename(file)
    +    reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
    +    ret = reader.Read()
    +    while ret == 1:
    +        ret = reader.Read()
    +    if ret != 0:
    +        print "Error parsing and validating %s" % (file)
    + +

    This routine will parse and validate the file. Error messages can be +captured by registering an error handler. See python/tests/reader2.py for +more complete Python examples. At the C level the equivalent call to activate +the validation feature is just:

    +
    ret = xmlTextReaderSetParserProp(reader, XML_PARSER_VALIDATE, 1)
    + +

    and a return value of 0 indicates success.

    +

    Entities substitution

    diff --git a/python/libxml.py b/python/libxml.py index a1b5a7bb..6203e3d5 100644 --- a/python/libxml.py +++ b/python/libxml.py @@ -302,11 +302,6 @@ class xmlCore: return libxml2mod.name(self._o) def get_type(self): return libxml2mod.type(self._o) - def get_doc(self): - ret = libxml2mod.doc(self._o) - if ret == None: - return None - return xmlDoc(_obj=ret) def free(self): libxml2mod.freeDoc(self._o) @@ -356,7 +351,7 @@ def nodeWrap(o): if name == "entity_decl": return xmlEntity(_obj=o) if name == "dtd": - return xmlAttr(_obj=o) + return xmlDtd(_obj=o) return xmlNode(_obj=o) def xpathObjectRet(o): diff --git a/xmllint.c b/xmllint.c index 41dac2fd..39165fd7 100644 --- a/xmllint.c +++ b/xmllint.c @@ -572,26 +572,24 @@ static int count = 0; static int elem, attrs; static void processNode(xmlTextReaderPtr reader) { - if (debug) { - xmlChar *name, *value; + xmlChar *name, *value; - name = xmlTextReaderName(reader); - if (name == NULL) - name = xmlStrdup(BAD_CAST "--"); - value = xmlTextReaderValue(reader); + name = xmlTextReaderName(reader); + if (name == NULL) + name = xmlStrdup(BAD_CAST "--"); + value = xmlTextReaderValue(reader); - printf("%d %d %d %s", - xmlTextReaderDepth(reader), - xmlTextReaderNodeType(reader), - xmlTextReaderIsEmptyElement(reader), - name); - xmlFree(name); - if (value == NULL) - printf("\n"); - else { - printf(" %s\n", value); - xmlFree(value); - } + printf("%d %d %s %d", + xmlTextReaderDepth(reader), + xmlTextReaderNodeType(reader), + name, + xmlTextReaderIsEmptyElement(reader)); + xmlFree(name); + if (value == NULL) + printf("\n"); + else { + printf(" %s\n", value); + xmlFree(value); } } diff --git a/xmlreader.c b/xmlreader.c index 159447a9..0577334b 100644 --- a/xmlreader.c +++ b/xmlreader.c @@ -602,6 +602,7 @@ xmlTextReaderRead(xmlTextReaderPtr reader) { reader->state = XML_TEXTREADER_ELEMENT; } else { reader->node = reader->ctxt->nodeTab[0]; + reader->state = XML_TEXTREADER_ELEMENT; } reader->depth = 0; goto node_found;