mirror of
				https://gitlab.gnome.org/GNOME/libxml2.git
				synced 2025-10-24 13:33:01 +03:00 
			
		
		
		
	Tue Aug 24 20:49:15 MDT 2004 John Fleck <jfleck@inkstain.net> * doc/tutorial/xmltutorial.xml, xmltutorial.pdf, *.html fix Xpath memory leak (thanks to sKaBoy and William Brack)
		
			
				
	
	
		
			748 lines
		
	
	
		
			28 KiB
		
	
	
	
		
			XML
		
	
	
	
	
	
			
		
		
	
	
			748 lines
		
	
	
		
			28 KiB
		
	
	
	
		
			XML
		
	
	
	
	
	
| <?xml version="1.0"?>
 | |
| <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN"
 | |
|     "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd" [
 | |
| <!ENTITY KEYWORD SYSTEM "includekeyword.c">
 | |
| <!ENTITY XPATH SYSTEM "includexpath.c">
 | |
| <!ENTITY STORY SYSTEM "includestory.xml">
 | |
| <!ENTITY ADDKEYWORD SYSTEM "includeaddkeyword.c">
 | |
| <!ENTITY ADDATTRIBUTE SYSTEM "includeaddattribute.c">
 | |
| <!ENTITY GETATTRIBUTE SYSTEM "includegetattribute.c">
 | |
| <!ENTITY CONVERT SYSTEM "includeconvert.c">
 | |
| ]>
 | |
| <article lang="en">
 | |
|   <articleinfo>
 | |
|     <title>Libxml Tutorial</title>
 | |
|     <author>
 | |
|       <firstname>John</firstname>
 | |
|       <surname>Fleck</surname>
 | |
|       <email>jfleck@inkstain.net</email>
 | |
|     </author>
 | |
|     <copyright>
 | |
|       <year>2002, 2003</year>
 | |
|       <holder>John Fleck</holder>
 | |
|     </copyright>
 | |
|     <revhistory>
 | |
|       <revision>
 | |
| 	<revnumber>1</revnumber>
 | |
| 	<date>June 4, 2002</date>
 | |
| 	<revremark>Initial draft</revremark>
 | |
|       </revision>
 | |
|       <revision>
 | |
| 	<revnumber>2</revnumber>
 | |
| 	<date>June 12, 2002</date>
 | |
| 	<revremark>retrieving attribute value added</revremark>
 | |
|       </revision>
 | |
|       <revision>
 | |
| 	<revnumber>3</revnumber>
 | |
| 	<date>Aug. 31, 2002</date>
 | |
| 	<revremark>freeing memory fix</revremark>
 | |
|       </revision>
 | |
|       <revision>
 | |
| 	<revnumber>4</revnumber>
 | |
| 	<date>Nov. 10, 2002</date>
 | |
| 	<revremark>encoding discussion added</revremark>
 | |
|       </revision>
 | |
|       <revision>
 | |
| 	<revnumber>5</revnumber>
 | |
| 	<date>Dec. 15, 2002</date>
 | |
| 	<revremark>more memory freeing changes</revremark>
 | |
|       </revision>
 | |
|       <revision>
 | |
| 	<revnumber>6</revnumber>
 | |
| 	<date>Jan. 26, 2003</date>
 | |
| 	<revremark>add index</revremark>
 | |
|       </revision>
 | |
|       <revision>
 | |
| 	<revnumber>7</revnumber>
 | |
| 	<date>April 25, 2003</date>
 | |
| 	<revremark>add compilation appendix</revremark>
 | |
|       </revision>
 | |
|       <revision>
 | |
| 	<revnumber>8</revnumber>
 | |
| 	<date>July 24, 2003</date>
 | |
| 	<revremark>add XPath example</revremark>
 | |
|       </revision>
 | |
|       <revision>
 | |
| 	<revnumber>9</revnumber>
 | |
| 	<date>Feb. 14, 2004</date>
 | |
| 	<revremark>Fix bug in XPath example</revremark>
 | |
|       </revision>
 | |
|       <revision>
 | |
| 	<revnumber>10</revnumber>
 | |
| 	<date>Aug. 24, 2004</date>
 | |
| 	<revremark>Fix another bug in XPath example</revremark>
 | |
|       </revision>
 | |
|     </revhistory>
 | |
|   </articleinfo>
 | |
|   <abstract>
 | |
|     <para>Libxml is a freely licensed C language library for handling
 | |
|     <acronym>XML</acronym>, portable across a large number of platforms. This
 | |
|     tutorial provides examples of its basic functions.</para>
 | |
|   </abstract>
 | |
|   <sect1 id="introduction">
 | |
|     <title>Introduction</title>
 | |
|     <para>Libxml is a C language library implementing functions for reading,
 | |
|       creating and manipulating <acronym>XML</acronym> data. This tutorial
 | |
|     provides example code and explanations of its basic functionality.</para>
 | |
|     <para>Libxml and more details about its use are available on <ulink
 | |
| 									url="http://www.xmlsoft.org/">the project home page</ulink>. Included there is complete <ulink url="http://xmlsoft.org/html/libxml-lib.html">
 | |
| 	<acronym>API</acronym> documentation</ulink>. This tutorial is not meant
 | |
|     to substitute for that complete documentation, but to illustrate the
 | |
|     functions needed to use the library to perform basic operations.
 | |
| <!--
 | |
|  Links to
 | |
|       other resources can be found in <xref linkend="furtherresources" />.
 | |
| -->
 | |
| </para>
 | |
|     <para>The tutorial is based on a simple <acronym>XML</acronym> application I
 | |
|     use for articles I write. The format includes metadata and the body
 | |
|     of the article.</para>
 | |
|     <para>The example code in this tutorial demonstrates how to:
 | |
|       <itemizedlist>
 | |
| 	<listitem>
 | |
| 	  <para>Parse the document.</para>
 | |
| 	</listitem>
 | |
| 	<listitem>
 | |
| 	  <para>Extract the text within a specified element.</para>
 | |
| 	</listitem>
 | |
| 	<listitem>
 | |
| 	  <para>Add an element and its content.</para>
 | |
| 	</listitem>
 | |
| 	<listitem>
 | |
| 	  <para>Add an attribute.</para>
 | |
| 	</listitem>      
 | |
| 	<listitem>
 | |
| 	  <para>Extract the value of an attribute.</para>
 | |
| 	</listitem>
 | |
|       </itemizedlist>
 | |
|     </para>
 | |
|     <para>Full code for the examples is included in the appendices.</para>
 | |
| 
 | |
|   </sect1>
 | |
| 
 | |
|   <sect1 id="xmltutorialdatatypes">
 | |
|     <title>Data Types</title>
 | |
|     <para><application>Libxml</application> declares a number of data types we
 | |
|     will encounter repeatedly, hiding the messy stuff so you do not have to deal
 | |
|     with it unless you have some specific need.</para>
 | |
|     <para>
 | |
|       <variablelist>
 | |
| 	<varlistentry>
 | |
| 	  <term><indexterm>
 | |
| 	      <primary>xmlChar</primary>
 | |
| 	    </indexterm>
 | |
| <ulink
 | |
| 	  url="http://xmlsoft.org/html/libxml-tree.html#XMLCHAR">xmlChar</ulink></term>
 | |
| 	  <listitem>
 | |
| 	    <para>A basic replacement for char, a byte in a UTF-8 encoded
 | |
| 	    string. If your data uses another encoding, it must be converted to
 | |
| 	      UTF-8 for use with <application>libxml's</application>
 | |
| 	      functions. More information on encoding is available on the <ulink
 | |
| 		url="http://www.xmlsoft.org/encoding.html"><application>libxml</application> encoding support web page</ulink>.</para>
 | |
| 	  </listitem>
 | |
| 	</varlistentry>
 | |
| 	<varlistentry>
 | |
| 	  <term><indexterm>
 | |
| 	      <primary>xmlDoc</primary>
 | |
| 	    </indexterm>
 | |
| 	    <ulink url="http://xmlsoft.org/html/libxml-tree.html#XMLDOC">xmlDoc</ulink></term>
 | |
| 	  <listitem>
 | |
| 	    <para>A structure containing the tree created by a parsed doc. <ulink
 | |
| 	  url="http://xmlsoft.org/html/libxml-tree.html#XMLDOCPTR">xmlDocPtr</ulink>
 | |
| 	  is a pointer to the structure.</para>
 | |
| 	  </listitem>
 | |
| 	</varlistentry>
 | |
| 	<varlistentry>
 | |
| 	  <term><indexterm>
 | |
| 	      <primary>xmlNodePtr</primary>
 | |
| 	    </indexterm>
 | |
| <ulink
 | |
| 	  url="http://xmlsoft.org/html/libxml-tree.html#XMLNODEPTR">xmlNodePtr</ulink>
 | |
| 	    and <ulink url="http://xmlsoft.org/html/libxml-tree.html#XMLNODE">xmlNode</ulink></term>
 | |
| 	  <listitem>
 | |
| 	    <para>A structure containing a single node. <ulink
 | |
| 	  url="http://xmlsoft.org/html/libxml-tree.html#XMLNODEPTR">xmlNodePtr</ulink>
 | |
| 	  is a pointer to the structure, and is used in traversing the document tree.</para>
 | |
| 	  </listitem>
 | |
| 	</varlistentry>
 | |
|       </variablelist>
 | |
|     </para>
 | |
| 
 | |
|   </sect1>
 | |
| 
 | |
|   <sect1 id="xmltutorialparsing">
 | |
|     <title>Parsing the file</title>
 | |
|     <para><indexterm id="fileparsing" class="startofrange">
 | |
| 	<primary>file</primary>
 | |
| 	<secondary>parsing</secondary>
 | |
|       </indexterm>
 | |
| Parsing the file requires only the name of the file and a single
 | |
|       function call, plus error checking. Full code: <xref
 | |
|     linkend="keywordappendix" /></para>
 | |
|     <para>
 | |
|     <programlisting>
 | |
|         <co id="declaredoc" /> xmlDocPtr doc;
 | |
| 	<co id="declarenode" /> xmlNodePtr cur;
 | |
| 
 | |
| 	<co id="parsefile" /> doc = xmlParseFile(docname);
 | |
| 	
 | |
| 	<co id="checkparseerror" /> if (doc == NULL ) {
 | |
| 		fprintf(stderr,"Document not parsed successfully. \n");
 | |
| 		return;
 | |
| 	}
 | |
| 
 | |
| 	<co id="getrootelement" /> cur = xmlDocGetRootElement(doc);
 | |
| 	
 | |
| 	<co id="checkemptyerror" /> if (cur == NULL) {
 | |
| 		fprintf(stderr,"empty document\n");
 | |
| 		xmlFreeDoc(doc);
 | |
| 		return;
 | |
| 	}
 | |
| 	
 | |
| 	<co id="checkroottype" /> if (xmlStrcmp(cur->name, (const xmlChar *) "story")) {
 | |
| 		fprintf(stderr,"document of the wrong type, root node != story");
 | |
| 		xmlFreeDoc(doc);
 | |
| 		return;
 | |
| 	}
 | |
| 
 | |
|     </programlisting>
 | |
|       <calloutlist>
 | |
| 	<callout arearefs="declaredoc">
 | |
| 	  <para>Declare the pointer that will point to your parsed document.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="declarenode">
 | |
| 	  <para>Declare a node pointer (you'll need this in order to
 | |
| 	  interact with individual nodes).</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="checkparseerror">
 | |
| 	  <para>Check to see that the document was successfully parsed. If it
 | |
| 	    was not, <application>libxml</application> will at this point
 | |
| 	    register an error and stop. 
 | |
| 	    <note>
 | |
| 	      <para><indexterm>
 | |
| 		  <primary>encoding</primary>
 | |
| 		</indexterm>
 | |
| One common example of an error at this point is improper
 | |
| 	    handling of encoding. The <acronym>XML</acronym> standard requires
 | |
| 	    documents stored with an encoding other than UTF-8 or UTF-16 to
 | |
| 	    contain an explicit declaration of their encoding. If the
 | |
| 	    declaration is there, <application>libxml</application> will
 | |
| 	    automatically perform the necessary conversion to UTF-8 for
 | |
| 		you. More information on <acronym>XML's</acronym> encoding
 | |
| 		requirements is contained in the <ulink
 | |
| 		  url="http://www.w3.org/TR/REC-xml#charencoding">standard</ulink>.</para>
 | |
| 	    </note>
 | |
| 	  </para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="getrootelement">
 | |
| 	  <para>Retrieve the document's root element.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="checkemptyerror">
 | |
| 	  <para>Check to make sure the document actually contains something.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="checkroottype">
 | |
| 	  <para>In our case, we need to make sure the document is the right
 | |
| 	  type. "story" is the root type of the documents used in this
 | |
| 	  tutorial.</para>
 | |
| 	</callout>
 | |
|       </calloutlist>
 | |
|       <indexterm startref="fileparsing" class="endofrange" />
 | |
|     </para>
 | |
|   </sect1>
 | |
| 
 | |
|   <sect1 id="xmltutorialgettext">
 | |
|     <title>Retrieving Element Content</title>
 | |
|     <para><indexterm>
 | |
| 	<primary>element</primary>
 | |
| 	<secondary>retrieving content</secondary>
 | |
|       </indexterm>
 | |
| Retrieving the content of an element involves traversing the document
 | |
|     tree until you find what you are looking for. In this case, we are looking
 | |
|     for an element called "keyword" contained within an element called "story". The
 | |
|     process to find the node we are interested in involves tediously walking the
 | |
|     tree. We assume you already have an xmlDocPtr called <varname>doc</varname>
 | |
|     and an xmlNodePtr called <varname>cur</varname>.</para>
 | |
| 
 | |
|     <para>
 | |
|       <programlisting>
 | |
| 	<co id="getchildnode" />cur = cur->xmlChildrenNode;
 | |
| 	<co id="huntstoryinfo" />while (cur != NULL) {
 | |
| 		if ((!xmlStrcmp(cur->name, (const xmlChar *)"storyinfo"))){
 | |
| 			parseStory (doc, cur);
 | |
| 		}
 | |
| 		 
 | |
| 	cur = cur->next;
 | |
| 	}
 | |
|       </programlisting>
 | |
| 
 | |
|       <calloutlist>
 | |
| 	<callout arearefs="getchildnode">
 | |
| 	  <para>Get the first child node of <varname>cur</varname>. At this
 | |
| 	    point, <varname>cur</varname> points at the document root, which is
 | |
| 	    the element "story".</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="huntstoryinfo">
 | |
| 	  <para>This loop iterates through the elements that are children of
 | |
| 	  "story", looking for one called "storyinfo". That
 | |
| 	  is the element that will contain the "keywords" we are
 | |
| 	    looking for. It uses the <application>libxml</application> string
 | |
| 	  comparison
 | |
| 	    function, <function><ulink
 | |
| 				       url="http://xmlsoft.org/html/libxml-parser.html#XMLSTRCMP">xmlStrcmp</ulink></function>. If there is a match, it calls the function <function>parseStory</function>.</para>
 | |
| 	</callout>
 | |
|       </calloutlist>
 | |
|     </para>
 | |
| 
 | |
|     <para>
 | |
|       <programlisting>
 | |
| void
 | |
| parseStory (xmlDocPtr doc, xmlNodePtr cur) {
 | |
| 
 | |
| 	xmlChar *key;
 | |
| 	<co id="anothergetchild" /> cur = cur->xmlChildrenNode;
 | |
| 	<co id="findkeyword" /> while (cur != NULL) {
 | |
| 	    if ((!xmlStrcmp(cur->name, (const xmlChar *)"keyword"))) {
 | |
| 	<co id="foundkeyword" />	    key = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1);
 | |
| 		    printf("keyword: %s\n", key);
 | |
| 		    xmlFree(key);
 | |
|  	    }
 | |
| 	cur = cur->next;
 | |
| 	}
 | |
|     return;
 | |
| }
 | |
|       </programlisting>
 | |
|       <calloutlist>
 | |
| 	<callout arearefs="anothergetchild">
 | |
| 	  <para>Again we get the first child node.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="findkeyword">
 | |
| 	  <para>Like the loop above, we then iterate through the nodes, looking
 | |
| 	  for one that matches the element we're interested in, in this case
 | |
| 	  "keyword".</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="foundkeyword">
 | |
| 	  <para>When we find the "keyword" element, we need to print
 | |
| 	    its contents. Remember that in <acronym>XML</acronym>, the text
 | |
| 	    contained within an element is a child node of that element, so we
 | |
| 	    turn to <varname>cur->xmlChildrenNode</varname>. To retrieve it, we
 | |
| 	    use the function <function><ulink
 | |
| 					      url="http://xmlsoft.org/html/libxml-tree.html#XMLNODELISTGETSTRING">xmlNodeListGetString</ulink></function>, which also takes the <varname>doc</varname> pointer as an argument. In this case, we just print it out.</para>
 | |
| 	  <note>
 | |
| 	    <para>Because <function>xmlNodeListGetString</function> allocates
 | |
| 	      memory for the string it returns, you must use
 | |
| 	      <function>xmlFree</function> to free it.</para>
 | |
| 	  </note>
 | |
| 	</callout>
 | |
|       </calloutlist>
 | |
|     </para>
 | |
| 
 | |
|   </sect1>
 | |
|   <sect1 id="xmltutorialxpath">
 | |
|     <title>Using XPath to Retrieve Element Content</title>
 | |
|     <para>In addition to walking the document tree to find an element,
 | |
|     <application>Libxml2</application> includes support for
 | |
|       use of <application>XPath</application> expressions to retrieve sets of
 | |
|       nodes that match specified criteria. Full documentation of the
 | |
|       <application>XPath</application> <acronym>API</acronym> is <ulink
 | |
| 	url="http://xmlsoft.org/html/libxml-xpath.html">here</ulink>.
 | |
|     </para>
 | |
|     <para><application>XPath</application> allows searching through a document
 | |
|     for nodes that match specified criteria. In the example below we search
 | |
|       through a document for the contents of all <varname>keyword</varname>
 | |
|     elements.
 | |
|       <note>
 | |
| 	<para>A full discussion of <application>XPath</application> is beyond
 | |
| 	  the scope of this document. For details on its use, see the <ulink
 | |
| 	    url="http://www.w3.org/TR/xpath">XPath specification</ulink>.</para>
 | |
|       </note>
 | |
|       Full code for this example is at <xref linkend="xpathappendix" />.
 | |
|     </para>
 | |
|     <para>Using <application>XPath</application> requires setting up an
 | |
|       xmlXPathContext and then supplying the <application>XPath</application>
 | |
|       expression and the context to the
 | |
|       <function>xmlXPathEvalExpression</function> function. The function returns
 | |
|       an xmlXPathObjectPtr, which includes the set of nodes satisfying the
 | |
|       <application>XPath</application> expression.</para>
 | |
|     <para>
 | |
|       <programlisting>
 | |
| 	xmlXPathObjectPtr
 | |
| 	getnodeset (xmlDocPtr doc, xmlChar *xpath){
 | |
| 	
 | |
| 	<co id="cocontext" />xmlXPathContextPtr context;
 | |
| 	xmlXPathObjectPtr result;
 | |
| 
 | |
| 	<co id="cocreatecontext" />context = xmlXPathNewContext(doc);
 | |
| 	<co id="corunxpath" />result = xmlXPathEvalExpression(xpath, context);
 | |
| 	<co id="cocheckxpathresult" />if(xmlXPathNodeSetIsEmpty(result->nodesetval)){
 | |
| 		xmlXPathFreeObject(result);
 | |
|                 printf("No result\n");
 | |
| 		return NULL;
 | |
|       </programlisting>
 | |
|       <calloutlist>
 | |
| 	<callout arearefs="cocontext">
 | |
| 	  <para>First we declare our variables.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="cocreatecontext">
 | |
| 	  <para>Initialize the <varname>context</varname> variable.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="corunxpath">
 | |
| 	  <para>Apply the <application>XPath</application> expression.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="cocheckxpathresult">
 | |
| 	  <para>Check the result and free the memory allocated to
 | |
| 	    <varname>result</varname> if no result is found.</para>
 | |
| 	</callout>
 | |
|       </calloutlist>
 | |
|     </para>
 | |
|     <para>The xmlXPathObjectPtr returned by the function contains a set of nodes
 | |
|     and other information needed to iterate through the set and act on the
 | |
|       results. For this example, our function returns the
 | |
|     <varname>xmlXPathObjectPtr</varname>. We use it to print the contents of
 | |
|       <varname>keyword</varname> nodes in our document. The node set object
 | |
|       includes the number of elements in the set (<varname>nodeNr</varname>) and
 | |
|       an array of nodes (<varname>nodeTab</varname>):
 | |
|       <programlisting>
 | |
| 	<co id="conodesetcounter" />for (i=0; i &lt; nodeset->nodeNr; i++) {
 | |
| 	<co id="coprintkeywords" />keyword = xmlNodeListGetString(doc, nodeset->nodeTab[i]->xmlChildrenNode, 1);
 | |
| 		printf("keyword: %s\n", keyword);
 | |
| 	        xmlFree(keyword);
 | |
| 	}
 | |
|       </programlisting>
 | |
|       <calloutlist>
 | |
| 	<callout arearefs="conodesetcounter">
 | |
| 	  <para>The value of <varname>nodeset->nodeNr</varname> holds the number of
 | |
| 	  elements in the node set. Here we use it to iterate through the array.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="coprintkeywords">
 | |
| 	  <para>Here we print the contents of each of the nodes returned.
 | |
| 	    <note>
 | |
| 	      <para>Note that we are printing the child node of the node that is
 | |
| 		returned, because the contents of the <varname>keyword</varname>
 | |
| 		element are a child text node.</para>
 | |
| 	    </note>
 | |
| 	  </para>
 | |
| 	</callout>
 | |
|       </calloutlist>
 | |
|     </para>
 | |
|   </sect1>
 | |
| <sect1 id="xmltutorialwritingcontent">
 | |
|     <title>Writing element content</title>
 | |
|     <para><indexterm>
 | |
| 	<primary>element</primary>
 | |
| 	<secondary>writing content</secondary>
 | |
|       </indexterm>
 | |
|       Writing element content uses many of the same steps we used above
 | |
|       — parsing the document and walking the tree. We parse the document,
 | |
|       then traverse the tree to find the place we want to insert our element. For
 | |
|       this example, we want to again find the "storyinfo" element and
 | |
|       this time insert a keyword. Then we'll write the file to disk. Full code:
 | |
|       <xref linkend="addkeywordappendix" /></para>
 | |
|     <para>
 | |
|       The main difference in this example is in
 | |
|       <function>parseStory</function>:
 | |
| 
 | |
|       <programlisting>
 | |
| void
 | |
| parseStory (xmlDocPtr doc, xmlNodePtr cur, char *keyword) {
 | |
| 
 | |
| 	<co id="addkeyword" /> xmlNewTextChild (cur, NULL, "keyword", keyword);
 | |
|     return;
 | |
| }
 | |
|       </programlisting>
 | |
|       <calloutlist>
 | |
| 	<callout arearefs="addkeyword">
 | |
| 	  <para>The <function><ulink
 | |
| 				     url="http://xmlsoft.org/html/libxml-tree.html#XMLNEWTEXTCHILD">xmlNewTextChild</ulink></function>
 | |
| 				     function adds a new child element at the
 | |
| 				     current node pointer's location in the
 | |
| 	    tree, specified by <varname>cur</varname>.</para>
 | |
| 	</callout>
 | |
|       </calloutlist>
 | |
|          </para>
 | |
| 
 | |
|     <para>
 | |
|       <indexterm>
 | |
| 	<primary>file</primary>
 | |
| 	<secondary>saving</secondary>
 | |
|       </indexterm>
 | |
|       Once the node has been added, we would like to write the document to
 | |
|       file. If you want the element to have a namespace, you can add it here as
 | |
|       well. In our case, the namespace is NULL.
 | |
|       <programlisting>
 | |
| 	xmlSaveFormatFile (docname, doc, 1);
 | |
|       </programlisting>
 | |
|       The first parameter is the name of the file to be written. You'll notice
 | |
|       it is the same as the file we just read. In this case, we just write over
 | |
|       the old file. The second parameter is a pointer to the xmlDoc
 | |
|       structure. Setting the third parameter equal to one ensures indenting on output.
 | |
|     </para>
 | |
|   </sect1>
 | |
| 
 | |
|   <sect1 id="xmltutorialwritingattribute">
 | |
|     <title>Writing Attribute</title>
 | |
|     <para><indexterm>
 | |
| 	<primary>attribute</primary>
 | |
| 	<secondary>writing</secondary>
 | |
|       </indexterm>
 | |
| Writing an attribute is similar to writing text to a new element. In
 | |
|       this case, we'll add a reference <acronym>URI</acronym> to our
 | |
|       document. Full code: <xref linkend="addattributeappendix" />.</para>
 | |
|     <para>
 | |
|       A <sgmltag>reference</sgmltag> is a child of the <sgmltag>story</sgmltag>
 | |
|       element, so finding the place to put our new element and attribute is
 | |
|       simple. As soon as we do the error-checking test in our
 | |
|       <function>parseDoc</function>, we are in the right spot to add our
 | |
|       element. But before we do that, we need to make a declaration using a
 | |
|       data type we have not seen yet:
 | |
|       <programlisting>
 | |
| 	xmlAttrPtr newattr;
 | |
|       </programlisting>
 | |
|       We also need an extra xmlNodePtr:
 | |
|       <programlisting>
 | |
| 	xmlNodePtr newnode;
 | |
|       </programlisting>
 | |
|     </para>
 | |
|     <para>
 | |
|       The rest of <function>parseDoc</function> is the same as before until we
 | |
|       check to see if our root element is <sgmltag>story</sgmltag>. If it is,
 | |
|       then we know we are at the right spot to add our element:
 | |
| 
 | |
|       <programlisting>
 | |
| 	<co id="addreferencenode" /> newnode = xmlNewTextChild (cur, NULL, "reference", NULL);
 | |
| 	<co id="addattributenode" /> newattr = xmlNewProp (newnode, "uri", uri);	
 | |
|       </programlisting>
 | |
|       <calloutlist>
 | |
| 	<callout arearefs="addreferencenode">
 | |
| 	  <para>First we add a new node at the location of the current node
 | |
| 	    pointer, <varname>cur</varname>, using the <ulink
 | |
| 							      url="http://xmlsoft.org/html/libxml-tree.html#XMLNEWTEXTCHILD">xmlNewTextChild</ulink> function.</para>
 | |
| 	</callout>
 | |
|       </calloutlist>
 | |
|    </para>
 | |
| 
 | |
|     <para>Once the node is added, the file is written to disk just as in the
 | |
|     previous example in which we added an element with text content.</para>
 | |
| 
 | |
|   </sect1>
 | |
| 
 | |
|   <sect1 id="xmltutorialattribute">
 | |
|     <title>Retrieving Attributes</title>
 | |
|     <para><indexterm>
 | |
| 	<primary>attribute</primary>
 | |
| 	<secondary>retrieving value</secondary>
 | |
|       </indexterm>
 | |
| Retrieving the value of an attribute is similar to the previous
 | |
|     example in which we retrieved a node's text contents. In this case we'll
 | |
|       extract the value of the <acronym>URI</acronym> we added in the previous
 | |
|       section. Full code: <xref linkend="getattributeappendix" />.</para>
 | |
|     <para>
 | |
|       The initial steps for this example are similar to the previous ones: parse
 | |
|       the doc, find the element you are interested in, then enter a function to
 | |
|       carry out the specific task required. In this case, we call
 | |
|       <function>getReference</function>:
 | |
|       <programlisting>
 | |
| void
 | |
| getReference (xmlDocPtr doc, xmlNodePtr cur) {
 | |
| 
 | |
| 	xmlChar *uri;
 | |
| 	cur = cur->xmlChildrenNode;
 | |
| 	while (cur != NULL) {
 | |
| 	    if ((!xmlStrcmp(cur->name, (const xmlChar *)"reference"))) {
 | |
| 		   <co id="getattributevalue" /> uri = xmlGetProp(cur, "uri");
 | |
| 		    printf("uri: %s\n", uri);
 | |
| 		    xmlFree(uri);
 | |
| 	    }
 | |
| 	    cur = cur->next;
 | |
| 	}
 | |
| 	return;
 | |
| }
 | |
|       </programlisting>
 | |
|     
 | |
|       <calloutlist>
 | |
| 	<callout arearefs="getattributevalue">
 | |
| 	  <para>
 | |
| 	    The key function is <function><ulink
 | |
| 					   url="http://xmlsoft.org/html/libxml-tree.html#XMLGETPROP">xmlGetProp</ulink></function>, which returns an
 | |
|       <varname>xmlChar</varname> containing the attribute's value. In this case,
 | |
| 					   we just print it out.
 | |
|       <note>
 | |
| 	<para>
 | |
| 	  If you are using a <acronym>DTD</acronym> that declares a fixed or
 | |
| 	  default value for the attribute, this function will retrieve it.
 | |
| 	</para>
 | |
| 	    </note>
 | |
| 	  </para>
 | |
| 	</callout>
 | |
|       </calloutlist>
 | |
|      
 | |
|     </para>
 | |
|   </sect1>
 | |
| 
 | |
|   <sect1 id="xmltutorialconvert">
 | |
|     <title>Encoding Conversion</title>
 | |
| 
 | |
|     <para><indexterm>
 | |
| 	<primary>encoding</primary>
 | |
|       </indexterm>
 | |
| Data encoding compatibility problems are one of the most common
 | |
|       difficulties encountered by programmers new to <acronym>XML</acronym> in
 | |
|       general and <application>libxml</application> in particular. Thinking
 | |
|       through the design of your application in light of this issue will help
 | |
|       avoid difficulties later. Internally, <application>libxml</application>
 | |
|       stores and manipulates data in the UTF-8 format. Data used by your program
 | |
|       in other formats, such as the commonly used ISO-8859-1 encoding, must be
 | |
|       converted to UTF-8 before passing it to <application>libxml</application>
 | |
|       functions. If you want your program's output in an encoding other than
 | |
|       UTF-8, you also must convert it.</para>
 | |
| 
 | |
|       <para><application>Libxml</application> uses
 | |
|       <application>iconv</application> if it is available to convert
 | |
|     data. Without <application>iconv</application>, only UTF-8, UTF-16 and
 | |
|     ISO-8859-1 can be used as external formats. With
 | |
|     <application>iconv</application>, any format can be used provided
 | |
|     <application>iconv</application> is able to convert it to and from
 | |
|     UTF-8. Currently <application>iconv</application> supports about 150
 | |
|     different character formats with the ability to convert from any to any. While
 | |
|     the actual number of supported formats varies between implementations, every
 | |
|     <application>iconv</application> implementation is almost guaranteed to
 | |
|     support every format anyone has ever heard of.</para>
 | |
| 
 | |
|     <warning>
 | |
|       <para>A common mistake is to use different formats for the internal data
 | |
| 	in different parts of one's code. The most common case is an application
 | |
| 	that assumes ISO-8859-1 to be the internal data format, combined with
 | |
| 	<application>libxml</application>, which assumes UTF-8 to be the
 | |
| 	internal data format. The result is an application that treats internal
 | |
| 	data differently, depending on which code section is executing. One or
 | |
| 	the other part of the code will then, naturally, misinterpret the data.
 | |
|       </para>
 | |
|     </warning>
 | |
| 
 | |
|     <para>This example constructs a simple document, then adds content provided
 | |
|     at the command line to the document's root element and outputs the results
 | |
|     to <filename>stdout</filename> in the proper encoding. For this example, we
 | |
|     use ISO-8859-1 encoding. The encoding of the string input at the command
 | |
|     line is converted from ISO-8859-1 to UTF-8. Full code: <xref
 | |
|     linkend="convertappendix" /></para>
 | |
| 
 | |
|     <para>The conversion, encapsulated in the example code in the
 | |
|       <function>convert</function> function, uses
 | |
|       <application>libxml's</application>
 | |
|     <function>xmlFindCharEncodingHandler</function> function:
 | |
|       <programlisting>
 | |
| 	<co id="handlerdatatype" />xmlCharEncodingHandlerPtr handler;
 | |
|         <co id="calcsize" />size = (int)strlen(in)+1; 
 | |
|         out_size = size*2-1; 
 | |
|         out = malloc((size_t)out_size); 
 | |
| 
 | |
| …
 | |
| 	<co id="findhandlerfunction" />handler = xmlFindCharEncodingHandler(encoding);
 | |
| …
 | |
| 	<co id="callconversionfunction" />handler->input(out, &amp;out_size, in, &amp;temp);
 | |
| …	
 | |
| 	<co id="outputencoding" />xmlSaveFormatFileEnc("-", doc, encoding, 1);
 | |
|       </programlisting>
 | |
|       <calloutlist>
 | |
| 	<callout arearefs="handlerdatatype">
 | |
| 	  <para><varname>handler</varname> is declared as a pointer to an
 | |
| 	    <function>xmlCharEncodingHandler</function> function.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="calcsize">
 | |
| 	  <para>The <function>xmlCharEncodingHandler</function> function needs
 | |
| 	  to be given the size of the input and output strings, which are
 | |
| 	    calculated here for strings <varname>in</varname> and
 | |
| 	  <varname>out</varname>.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="findhandlerfunction">
 | |
| 	  <para><function>xmlFindCharEncodingHandler</function> takes as its
 | |
| 	    argument the data's initial encoding and searches
 | |
| 	    <application>libxml's</application> built-in set of conversion
 | |
| 	    handlers, returning a pointer to the function or NULL if none is
 | |
| 	    found.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="callconversionfunction">
 | |
| 	  <para>The conversion function identified by <varname>handler</varname>
 | |
| 	  requires as its arguments pointers to the input and output strings,
 | |
| 	  along with the length of each. The lengths must be determined
 | |
| 	  separately by the application.</para>
 | |
| 	</callout>
 | |
| 	<callout arearefs="outputencoding">
 | |
| 	  <para>To output in a specified encoding rather than UTF-8, we use
 | |
| 	    <function>xmlSaveFormatFileEnc</function>, specifying the
 | |
| 	    encoding.</para>
 | |
| 	</callout>
 | |
|       </calloutlist>
 | |
|     </para>
 | |
|   </sect1>
 | |
| 
 | |
|   <appendix id="compilation">
 | |
|     <title>Compilation</title>
 | |
|     <para><indexterm>
 | |
| 	<primary>compiler flags</primary>
 | |
|       </indexterm>
 | |
|       <application>Libxml</application> includes a script,
 | |
|     <application>xml2-config</application>, that can be used to generate
 | |
|     flags for compilation and linking of programs written with the
 | |
|       library. For pre-processor and compiler flags, use <command>xml2-config
 | |
| 	--cflags</command>. For library linking flags, use <command>xml2-config
 | |
| 	--libs</command>. Other options are available using <command>xml2-config
 | |
|     --help</command>.</para>   
 | |
|   </appendix>
 | |
| 
 | |
|   <appendix id="sampledoc">
 | |
|     <title>Sample Document</title>
 | |
|     <programlisting>&STORY;</programlisting>
 | |
|   </appendix>
 | |
|   <appendix id="keywordappendix">
 | |
|     <title>Code for Keyword Example</title>
 | |
|     <para>
 | |
|       <programlisting>&KEYWORD;</programlisting>
 | |
|     </para>
 | |
|   </appendix>
 | |
|   <appendix id="xpathappendix">
 | |
|     <title>Code for XPath Example</title>
 | |
|     <para>
 | |
|       <programlisting>&XPATH;</programlisting>
 | |
|     </para>
 | |
|   </appendix>
 | |
| <appendix id="addkeywordappendix">
 | |
|     <title>Code for Add Keyword Example</title>
 | |
|     <para>
 | |
|       <programlisting>&ADDKEYWORD;</programlisting>
 | |
|     </para>
 | |
|   </appendix>
 | |
| <appendix id="addattributeappendix">
 | |
|     <title>Code for Add Attribute Example</title>
 | |
|     <para>
 | |
|       <programlisting>&ADDATTRIBUTE;</programlisting>
 | |
|     </para>
 | |
|   </appendix>
 | |
| <appendix id="getattributeappendix">
 | |
|     <title>Code for Retrieving Attribute Value Example</title>
 | |
|     <para>
 | |
|       <programlisting>&GETATTRIBUTE;</programlisting>
 | |
|     </para>
 | |
|   </appendix>
 | |
|   <appendix id="convertappendix">
 | |
|     <title>Code for Encoding Conversion Example</title>
 | |
|     <para>
 | |
|       <programlisting>&CONVERT;</programlisting>
 | |
|     </para>
 | |
|   </appendix>
 | |
|   <appendix>
 | |
|     <title>Acknowledgements</title>
 | |
|     <para>A number of people have generously offered feedback, code and
 | |
|     suggested improvements to this tutorial. In no particular order:
 | |
|       <simplelist type="inline">
 | |
| 	<member>Daniel Veillard</member>
 | |
| 	<member>Marcus Labib Iskander</member>
 | |
| 	<member>Christopher R. Harris</member>
 | |
| 	<member>Igor Zlatkovic</member>
 | |
| 	<member>Niraj Tolia</member>
 | |
| 	<member>David Turover</member>
 | |
|       </simplelist>
 | |
|     </para>
 | |
|   </appendix>
 | |
|   <index />
 | |
| </article>
 |