From be803967dbecb5534c7c7fbc1a17157ba43366b5 Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Wed, 28 Jun 2000 23:40:59 +0000 Subject: [PATCH] - Large resync between W3C and Gnome tree - configure.in: 2.1.0 prerelease - example/Makefile.am example/gjobread.c tree.h: work on libxml1 libxml2 convergence. - nanoftp, nanohttp.c: fixed stalled connections probs - HTMLtree.c SAX.c : support for attribute without values in HTML for andersca - valid.c: Fixed most validation + namespace problems - HTMLparser.c: start document callback for andersca - debugXML.c xpath.c: lots of XPath fixups from Picdar Technology - parser.h, SAX.c: serious speed improvement for large CDATA blocks - encoding.[ch] xmlIO.[ch]: Improved seriously saving to different encoding - config.h.in parser.c xmllint.c: added xmlCheckVersion() and the LIBXML_TEST_VERSION macro Daniel --- ChangeLog | 19 + HTMLparser.c | 64 +- HTMLtree.c | 16 +- Makefile.am | 6 +- README | 2 + SAX.c | 178 ++++- TODO | 15 +- debugXML.c | 2 +- doc/html/gnome-xml-tree.html | 1310 -------------------------------- encoding.c | 406 +++++++++- encoding.h | 6 + entities.c | 80 ++ entities.h | 2 + example/Makefile.am | 6 +- example/gjobread.c | 39 +- include/libxml/encoding.h | 6 + include/libxml/entities.h | 2 + include/libxml/parser.h | 10 +- include/libxml/tree.h | 32 +- include/libxml/uri.h | 2 + include/libxml/xmlIO.h | 79 +- include/libxml/xmlversion.h.in | 6 + include/libxml/xpath.h | 6 +- nanoftp.c | 10 +- nanohttp.c | 17 + parser.c | 197 +++-- parser.h | 10 +- result/HTML/test3.html | 2 + result/HTML/wired.html | 4 +- testURI.c | 2 + tree.c | 736 +++++++++++++++++- tree.h | 32 +- uri.c | 5 +- uri.h | 2 + valid.c | 331 +++++++- xmlIO.c | 627 ++++++++++++++- xmlIO.h | 79 +- xmllint.c | 14 + xmlversion.h.in | 6 + xpath.c | 65 +- xpath.h | 6 +- 41 files changed, 2877 insertions(+), 1562 deletions(-) diff --git a/ChangeLog b/ChangeLog index cc6b43c5..37094c46 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +Wed Jun 28 
23:10:26 MEST 2000 Daniel Veillard + + * configure.in: 2.1.0 prerelease + * Large resync between W3C and Gnome tree + * nanoftp, nanohttp.c: fixed stalled connections probs + * HTMLtree.c SAX.c : support for attribute without values in + HTML for andersca + * valid.c: Fixed most validation + namespace problems + * HTMLparser.c: start document callback for andersca + * debugXML.c xpath.c: lots of XPath fixups from Picdar Technology + * parser.h, SAX.c: serious speed improvement for large + CDATA blocks + * encoding.[ch] xmlIO.[ch]: Improved seriously saving to + different encoding + * example/Makefile.am example/gjobread.c tree.h: work on + libxml1 libxml2 convergence. + * config.h.in parser.c xmllint.c: added xmlCheckVersion() + and the LIBXML_TEST_VERSION macro + Fri Jun 23 22:26:07 CEST 2000 Daniel Veillard * doc/xml.html: various patches and improvements typo fixed by diff --git a/HTMLparser.c b/HTMLparser.c index 5c1cfaee..cfd65bd7 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -388,6 +388,7 @@ char *htmlStartClose[] = { NULL }; + static char** htmlStartCloseIndex[100]; static int htmlStartCloseIndexinitialized = 0; @@ -604,6 +605,54 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) { } } +/** + * htmlCheckImplied: + * @ctxt: an HTML parser context + * @new: The new tag name + * + * The HTmL DtD allows a tag to exists only implicitely + * called when a new tag has been detected and generates the + * appropriates implicit tags if missing + */ +void +htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *new) { + if (!strcmp(new, "html")) + return; + if (ctxt->nameNr <= 0) { +#ifdef DEBUG + fprintf(stderr,"Implied element html: pushed html\n"); +#endif + htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html")); + if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) + ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); + } + if ((!strcmp(new, "body")) || (!strcmp(new, "head"))) + return; + if (ctxt->nameNr <= 1) { + if ((!strcmp(new, 
"script")) || (!strcmp(new, "style")) || + (!strcmp(new, "meta")) || (!strcmp(new, "link")) || + (!strcmp(new, "title")) || (!strcmp(new, "base"))) { + /* + * dropped OBJECT ... i you put it first BODY will be + * assumed ! + */ +#ifdef DEBUG + fprintf(stderr,"Implied element head: pushed head\n"); +#endif + htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head")); + if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) + ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); + } else { +#ifdef DEBUG + fprintf(stderr,"Implied element body: pushed body\n"); +#endif + htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body")); + if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) + ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); + } + } +} + /************************************************************************ * * * The list of HTML predefined entities * @@ -1322,6 +1371,7 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); else xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI); + cur->doc = cur; cur->name = NULL; cur->children = NULL; cur->extSubset = NULL; @@ -2161,11 +2211,12 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { NEXT; SKIP_BLANKS; val = htmlParseAttValue(ctxt); + /****** } else { - /* TODO : some attribute must have values, some may not */ + * TODO : some attribute must have values, some may not if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->warning(ctxt->userData, - "No value for attribute %s\n", name); + "No value for attribute %s\n", name); */ } *value = val; @@ -2219,6 +2270,11 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { */ htmlAutoClose(ctxt, name); + /* + * Check for implied HTML elements. 
+ */ + htmlCheckImplied(ctxt, name); + /* * Now parse the attributes, it ends up with the ending * @@ -2759,6 +2815,10 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { ctxt->wellFormed = 0; } + if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) + ctxt->sax->startDocument(ctxt->userData); + + /* * Parse possible comments before any content */ diff --git a/HTMLtree.c b/HTMLtree.c index e4a955cc..24a90ba7 100644 --- a/HTMLtree.c +++ b/HTMLtree.c @@ -84,13 +84,15 @@ htmlAttrDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) { } xmlBufferWriteChar(buf, " "); xmlBufferWriteCHAR(buf, cur->name); - value = xmlNodeListGetString(doc, cur->children, 0); - if (value) { - xmlBufferWriteChar(buf, "="); - xmlBufferWriteQuotedString(buf, value); - xmlFree(value); - } else { - xmlBufferWriteChar(buf, "=\"\""); + if (cur->children != NULL) { + value = xmlNodeListGetString(doc, cur->children, 0); + if (value) { + xmlBufferWriteChar(buf, "="); + xmlBufferWriteQuotedString(buf, value); + xmlFree(value); + } else { + xmlBufferWriteChar(buf, "=\"\""); + } } } diff --git a/Makefile.am b/Makefile.am index 74624daf..3f4b0038 100644 --- a/Makefile.am +++ b/Makefile.am @@ -86,12 +86,14 @@ testURI_LDADD= $(LDADDS) check-local: tests -install-data: $(srcdir)/libxml - $(srcdir)/libxml: -$(RM) $(srcdir)/libxml ln -s $(srcdir)/. 
$(srcdir)/libxml +install-data: $(srcdir)/libxml + +$(libxml_la_SOURCES): $(srcdir)/libxml + testall : tests SVGtests SAXtests XPathtests XMLenttests tests: XMLtests HTMLtests Validtests diff --git a/README b/README index 2746bfea..cad4d773 100644 --- a/README +++ b/README @@ -4,6 +4,8 @@ Full documentation is available on-line at http://xmlsoft.org/ +This code is released under the LGPL and the W3C IPR + A mailing-list is available, to subscribe: echo "subscribe xml" | mail majordomo@rufus.w3.org diff --git a/SAX.c b/SAX.c index dace3058..ee3af393 100644 --- a/SAX.c +++ b/SAX.c @@ -24,6 +24,7 @@ #include #include #include +#include /* #define DEBUG_SAX */ /* #define DEBUG_SAX_TREE */ @@ -193,6 +194,7 @@ externalSubset(void *ctx, const xmlChar *name, int oldwellFormed; xmlParserInputPtr input = NULL; xmlCharEncoding enc; + xmlCharEncoding oldcharset; /* * Ask the Entity resolver to load the damn thing @@ -214,6 +216,7 @@ externalSubset(void *ctx, const xmlChar *name, oldinputMax = ctxt->inputMax; oldinputTab = ctxt->inputTab; oldwellFormed = ctxt->wellFormed; + oldcharset = ctxt->charset; ctxt->inputTab = (xmlParserInputPtr *) xmlMalloc(5 * sizeof(xmlParserInputPtr)); @@ -227,6 +230,7 @@ externalSubset(void *ctx, const xmlChar *name, ctxt->inputNr = oldinputNr; ctxt->inputMax = oldinputMax; ctxt->inputTab = oldinputTab; + ctxt->charset = oldcharset; return; } ctxt->inputNr = 0; @@ -269,6 +273,7 @@ externalSubset(void *ctx, const xmlChar *name, ctxt->inputNr = oldinputNr; ctxt->inputMax = oldinputMax; ctxt->inputTab = oldinputTab; + ctxt->charset = oldcharset; /* ctxt->wellFormed = oldwellFormed; */ } } @@ -604,6 +609,14 @@ endDocument(void *ctx) ctxt->myDoc->encoding = ctxt->encoding; ctxt->encoding = NULL; } + if ((ctxt->inputTab[0]->encoding != NULL) && (ctxt->myDoc != NULL) && + (ctxt->myDoc->encoding == NULL)) { + ctxt->myDoc->encoding = xmlStrdup(ctxt->inputTab[0]->encoding); + } + if ((ctxt->charset != XML_CHAR_ENCODING_NONE) && (ctxt->myDoc != NULL) && + 
(ctxt->myDoc->charset == XML_CHAR_ENCODING_NONE)) { + ctxt->myDoc->charset = ctxt->charset; + } } /** @@ -640,7 +653,10 @@ attribute(void *ctx, const xmlChar *fullname, const xmlChar *value) /* * Do the last stave of the attribute normalization */ - nval = xmlValidNormalizeAttributeValue(ctxt->myDoc, + if (ctxt->html) + nval = NULL; + else + nval = xmlValidNormalizeAttributeValue(ctxt->myDoc, ctxt->node, fullname, value); if (nval != NULL) value = nval; @@ -648,9 +664,25 @@ attribute(void *ctx, const xmlChar *fullname, const xmlChar *value) /* * Check whether it's a namespace definition */ - if ((ns == NULL) && + if ((!ctxt->html) && (ns == NULL) && (name[0] == 'x') && (name[1] == 'm') && (name[2] == 'l') && (name[3] == 'n') && (name[4] == 's') && (name[5] == 0)) { + xmlURIPtr uri; + + uri = xmlParseURI((const char *)value); + if (uri == NULL) { + if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL)) + ctxt->sax->warning(ctxt->userData, + "nmlns: %s not a valid URI\n", value); + } else { + if (uri->scheme == NULL) { + if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL)) + ctxt->sax->warning(ctxt->userData, + "nmlns: URI %s is not absolute\n", value); + } + xmlFreeURI(uri); + } + /* a default namespace definition */ xmlNewNs(ctxt->node, value, NULL); if (name != NULL) @@ -659,7 +691,8 @@ attribute(void *ctx, const xmlChar *fullname, const xmlChar *value) xmlFree(nval); return; } - if ((ns != NULL) && (ns[0] == 'x') && (ns[1] == 'm') && (ns[2] == 'l') && + if ((!ctxt->html) && + (ns != NULL) && (ns[0] == 'x') && (ns[1] == 'm') && (ns[2] == 'l') && (ns[3] == 'n') && (ns[4] == 's') && (ns[5] == 0)) { /* * Validate also for namespace decls, they are attributes from @@ -701,7 +734,7 @@ attribute(void *ctx, const xmlChar *fullname, const xmlChar *value) ret->last = tmp; tmp = tmp->next; } - } else { + } else if (value != NULL) { ret->children = xmlNewDocText(ctxt->myDoc, value); ret->last = ret->children; if (ret->children != NULL) @@ -709,7 +742,7 @@ 
attribute(void *ctx, const xmlChar *fullname, const xmlChar *value) } } - if (ctxt->validate && ctxt->wellFormed && + if ((!ctxt->html) && ctxt->validate && ctxt->wellFormed && ctxt->myDoc && ctxt->myDoc->intSubset) { /* @@ -817,6 +850,7 @@ startElement(void *ctx, const xmlChar *fullname, const xmlChar **atts) } else if (parent == NULL) { parent = ctxt->myDoc->children; } + ctxt->nodemem = -1; /* * We are parsing a new node. @@ -844,15 +878,6 @@ startElement(void *ctx, const xmlChar *fullname, const xmlChar **atts) } } - /* - * If it's the Document root, finish the Dtd validation and - * check the document root element for validity - */ - if ((ctxt->validate) && (ctxt->vctxt.finishDtd == 0)) { - ctxt->valid &= xmlValidateDtdFinal(&ctxt->vctxt, ctxt->myDoc); - ctxt->valid &= xmlValidateRoot(&ctxt->vctxt, ctxt->myDoc); - ctxt->vctxt.finishDtd = 1; - } /* * process all the attributes whose name start with "xml" */ @@ -860,31 +885,14 @@ startElement(void *ctx, const xmlChar *fullname, const xmlChar **atts) i = 0; att = atts[i++]; value = atts[i++]; - while ((att != NULL) && (value != NULL)) { - if ((att[0] == 'x') && (att[1] == 'm') && (att[2] == 'l')) - attribute(ctxt, att, value); + if (!ctxt->html) { + while ((att != NULL) && (value != NULL)) { + if ((att[0] == 'x') && (att[1] == 'm') && (att[2] == 'l')) + attribute(ctxt, att, value); - att = atts[i++]; - value = atts[i++]; - } - } - - /* - * process all the other attributes - */ - if (atts != NULL) { - i = 0; - att = atts[i++]; - value = atts[i++]; - while ((att != NULL) && (value != NULL)) { - if ((att[0] != 'x') || (att[1] != 'm') || (att[2] != 'l')) - attribute(ctxt, att, value); - - /* - * Next ones - */ - att = atts[i++]; - value = atts[i++]; + att = atts[i++]; + value = atts[i++]; + } } } @@ -897,6 +905,43 @@ startElement(void *ctx, const xmlChar *fullname, const xmlChar **atts) ns = xmlSearchNs(ctxt->myDoc, parent, prefix); xmlSetNs(ret, ns); + /* + * process all the other attributes + */ + if (atts != NULL) 
{ + i = 0; + att = atts[i++]; + value = atts[i++]; + if (ctxt->html) { + while (att != NULL) { + attribute(ctxt, att, value); + att = atts[i++]; + value = atts[i++]; + } + } else { + while ((att != NULL) && (value != NULL)) { + if ((att[0] != 'x') || (att[1] != 'm') || (att[2] != 'l')) + attribute(ctxt, att, value); + + /* + * Next ones + */ + att = atts[i++]; + value = atts[i++]; + } + } + } + + /* + * If it's the Document root, finish the Dtd validation and + * check the document root element for validity + */ + if ((ctxt->validate) && (ctxt->vctxt.finishDtd == 0)) { + ctxt->valid &= xmlValidateDtdFinal(&ctxt->vctxt, ctxt->myDoc); + ctxt->valid &= xmlValidateRoot(&ctxt->vctxt, ctxt->myDoc); + ctxt->vctxt.finishDtd = 1; + } + if (prefix != NULL) xmlFree(prefix); if (name != NULL) @@ -932,6 +977,7 @@ endElement(void *ctx, const xmlChar *name) node_info.node = cur; xmlParserAddNodeInfo(ctxt, &node_info); } + ctxt->nodemem = -1; if (ctxt->validate && ctxt->wellFormed && ctxt->myDoc && ctxt->myDoc->intSubset) @@ -1008,14 +1054,62 @@ characters(void *ctx, const xmlChar *ch, int len) #ifdef DEBUG_SAX_TREE fprintf(stderr, "add chars to %s \n", ctxt->node->name); #endif - if (lastChild == NULL) + + /* + * Here we needed an accelerator mechanism in case of very large + * elements. Use an attribute in the structure !!! + */ + if (lastChild == NULL) { + /* first node, first time */ xmlNodeAddContentLen(ctxt->node, ch, len); - else { - if (xmlNodeIsText(lastChild)) +#ifndef XML_USE_BUFFER_CONTENT + if (ctxt->node->children != NULL) { + ctxt->nodelen = len; + ctxt->nodemem = len + 1; + } +#endif + } else { + if (xmlNodeIsText(lastChild)) { +#ifndef XML_USE_BUFFER_CONTENT + /* + * The whole point of maintaining nodelen and nodemem, + * xmlTextConcat is too costly, i.e. compute lenght, + * reallocate a new buffer, move data, append ch. Here + * We try to minimaze realloc() uses and avoid copying + * and recomputing lenght over and over. 
+ */ + if (ctxt->nodelen + len >= ctxt->nodemem) { + xmlChar *newbuf; + int size; + + size = ctxt->nodemem + len; + size *= 2; + newbuf = (xmlChar *) xmlRealloc(lastChild->content,size); + if (newbuf == NULL) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "SAX.characters(): out of memory\n"); + return; + } + ctxt->nodemem = size; + lastChild->content = newbuf; + } + memcpy(&lastChild->content[ctxt->nodelen], ch, len); + ctxt->nodelen += len; + lastChild->content[ctxt->nodelen] = 0; +#else xmlTextConcat(lastChild, ch, len); - else { +#endif + } else { + /* Mixed content, first time */ lastChild = xmlNewTextLen(ch, len); xmlAddChild(ctxt->node, lastChild); +#ifndef XML_USE_BUFFER_CONTENT + if (ctxt->node->children != NULL) { + ctxt->nodelen = len; + ctxt->nodemem = len + 1; + } +#endif } } } diff --git a/TODO b/TODO index 2b4ae5ee..8545e1a6 100644 --- a/TODO +++ b/TODO @@ -6,14 +6,17 @@ TODO: ===== -- xmlSwitchToEncoding() need a rewrite for correct handling of conversion - error code conditions. +- If the internal encoding is not UTF8 saving to a given encoding doesn't + work +- problem when parsing hrefs with & with the HTML parser (IRC ac) - DOM needs xmlAttrPtr xmlNewDocProp(xmlDocPtr doc, const xmlChar *name, const xmlChar *value) int xmlPruneProp(xmlNodePtr node, xmlAtttrPtr attr); +- General checking of DTD validation in presence of namespaces ... hairy + mostly done - Fix DTD + namespace validity problem "Not valid: root and DtD name do not match 'ROOT' and 'prefix:ROOT'" -- add support for the trick from Henry conf/sun/valid/empty.xml + mostly done - Correct standalone checking/emitting (hard) 2.9 Standalone Document Declaration - Better checking of external parsed entities TAG 1234 @@ -24,9 +27,10 @@ TODO: - Handle undefined namespaces in entity contents better ... at least issue a warning - Issue warning when using non-absolute namespaces URI. 
-- General checking of DTD validation in presence of namespaces ... hairy - fix --disable-corba configure switch handling, and use XML_WITHOUT_CORBA not WITHOUT_CORBA flag +- the html parser should add and if they don't exist +- Command to force the parser to stop parsing and ignore the rest of the file. TODO: ===== @@ -92,6 +96,9 @@ EXTENSIONS: Done: ===== +- support for HTML empty attributes like
+- plugged iconv() in for support of a large set of encodings. +- xmlSwitchToEncoding() rewrite done - URI checkings (no fragments) rfc2396.txt - Added a clean mechanism for overload or added input methods: xmlRegisterInputCallbacks() diff --git a/debugXML.c b/debugXML.c index 6a271b80..88158d8d 100644 --- a/debugXML.c +++ b/debugXML.c @@ -1326,7 +1326,7 @@ xmlShellDu(xmlShellCtxtPtr ctxt, char *arg, xmlNodePtr tree, if ((node->type == XML_DOCUMENT_NODE) || (node->type == XML_HTML_DOCUMENT_NODE)) { node = ((xmlDocPtr) node)->children; - } else if (node->children != NULL) { + } else if ((node->children != NULL) && (node->type != XML_ENTITY_REF_NODE)) { /* deep first */ node = node->children; indent++; diff --git a/doc/html/gnome-xml-tree.html b/doc/html/gnome-xml-tree.html index e3349886..c1a4e984 100644 --- a/doc/html/gnome-xml-tree.html +++ b/doc/html/gnome-xml-tree.html @@ -8137,1314 +8137,4 @@ HREF="gnome-xml-tree.html#XMLCHAR" >xmlChar* xmlNodeGetContent (xmlNodePtr cur);

Read the value of a node; this can be either the text carried -directly by this node if it's a TEXT node or the aggregate string -of the values carried by this node's children (TEXT and ENTITY_REF). -Entity references are substituted.

cur : 
Returns : 


xmlNodeGetLang ()

xmlChar*    xmlNodeGetLang                  (xmlNodePtr cur);

Searches for the language of a node, i.e. the value of the xml:lang -attribute or the one carried by the nearest ancestor.

cur : 
Returns : 


xmlNodeSetLang ()

void        xmlNodeSetLang                  (xmlNodePtr cur,
-                                             const xmlChar *lang);

Set the language of a node, i.e. the value of the xml:lang -attribute.

cur : 
lang : 


xmlRemoveProp ()

int         xmlRemoveProp                   (xmlAttrPtr attr);

Unlink and free one attribute, all the content is freed too -Note this doesn't work for namespace definition attributes

attr : 
Returns : 


xmlRemoveNode ()

int         xmlRemoveNode                   (xmlNodePtr node);

node : 
Returns : 


xmlBufferWriteCHAR ()

void        xmlBufferWriteCHAR              (xmlBufferPtr buf,
-                                             const xmlChar *string);

routine which manages and grows an output buffer. This one adds -xmlChars at the end of the buffer.

buf : 
string : 


xmlBufferWriteChar ()

void        xmlBufferWriteChar              (xmlBufferPtr buf,
-                                             const char *string);

routine which manages and grows an output buffer. This one adds -C chars at the end of the array.

buf : 
string : 


xmlBufferWriteQuotedString ()

void        xmlBufferWriteQuotedString      (xmlBufferPtr buf,
-                                             const xmlChar *string);

routine which manages and grows an output buffer. This one writes -a quoted or double quoted xmlChar string, checking first if it holds -quote or double-quotes internally

buf : 
string : 


xmlDocDumpMemory ()

void        xmlDocDumpMemory                (xmlDocPtr cur,
-                                             xmlChar **mem,
-                                             int *size);

Dump an XML document in memory and return the xmlChar * and its size. -It's up to the caller to free the memory.

cur : 
mem : 
size : 


xmlDocDump ()

void        xmlDocDump                      (FILE *f,
-                                             xmlDocPtr cur);

Dump an XML document to an open FILE.

f : 
cur : 


xmlSaveFile ()

int         xmlSaveFile                     (const char *filename,
-                                             xmlDocPtr cur);

Dump an XML document to a file. Will use compression if -compiled in and enabled. If filename is "-" the stdout file is -used.

filename : 
cur : 
Returns : 


xmlGetDocCompressMode ()

int         xmlGetDocCompressMode           (xmlDocPtr doc);

get the compression ratio for a document, ZLIB based

doc : 
Returns : 


xmlSetDocCompressMode ()

void        xmlSetDocCompressMode           (xmlDocPtr doc,
-                                             int mode);

set the compression ratio for a document, ZLIB based -Correct values: 0 (uncompressed) to 9 (max compression)

doc : 
mode : 


xmlGetCompressMode ()

int         xmlGetCompressMode              (void);

get the default compression mode used, ZLIB based.

Returns : 


xmlSetCompressMode ()

void        xmlSetCompressMode              (int mode);

set the default compression mode used, ZLIB based -Correct values: 0 (uncompressed) to 9 (max compression)

mode : 

\ No newline at end of file diff --git a/encoding.c b/encoding.c index 1a4c157f..627f4682 100644 --- a/encoding.c +++ b/encoding.c @@ -3,6 +3,7 @@ * * Related specs: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies + * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau * [ISO-10646] UTF-8 and UTF-16 in Annexes * [ISO-8859-1] ISO Latin-1 characters codes. * [UNICODE] The Unicode Consortium, "The Unicode Standard -- @@ -65,6 +66,73 @@ static int xmlLittleEndian = 1; * I hope we won't use values > 0xFFFF anytime soon ! */ +/** + * xmlGetUTF8Char: + * @utf: a sequence of UTF-8 encoded bytes + * @len: a pointer to @bytes len + * + * Read one UTF8 Char from @utf + * + * Returns the char value or -1 in case of error and update @len with the + * number of bytes used + */ +int +xmlGetUTF8Char(const unsigned char *utf, int *len) { + unsigned int c; + + if (utf == NULL) + goto error; + if (len == NULL) + goto error; + if (*len < 1) + goto error; + + c = utf[0]; + if (c & 0x80) { + if (*len < 2) + goto error; + if ((utf[1] & 0xc0) != 0x80) + goto error; + if ((c & 0xe0) == 0xe0) { + if (*len < 3) + goto error; + if ((utf[2] & 0xc0) != 0x80) + goto error; + if ((c & 0xf0) == 0xf0) { + if (*len < 4) + goto error; + if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) + goto error; + *len = 4; + /* 4-byte code */ + c = (utf[0] & 0x7) << 18; + c |= (utf[1] & 0x3f) << 12; + c |= (utf[2] & 0x3f) << 6; + c |= utf[3] & 0x3f; + } else { + /* 3-byte code */ + *len = 3; + c = (utf[0] & 0xf) << 12; + c |= (utf[1] & 0x3f) << 6; + c |= utf[2] & 0x3f; + } + } else { + /* 2-byte code */ + *len = 2; + c = (utf[0] & 0x1f) << 6; + c |= utf[1] & 0x3f; + } + } else { + /* 1-byte code */ + *len = 1; + } + return(c); + +error: + *len = 0; + return(-1); +} + /** * xmlCheckUTF8: Check utf-8 string for legality. * @utf: Pointer to putative utf-8 encoded string. 
@@ -236,7 +304,7 @@ UTF16LEToUTF8(unsigned char* out, int *outlen, (*inlenb)--; inlen = *inlenb / 2; inend = in + inlen; - while (in < inend) { + while ((in < inend) && (out - outstart + 5 < *outlen)) { if (xmlLittleEndian) { c= *in++; } else { @@ -246,9 +314,9 @@ UTF16LEToUTF8(unsigned char* out, int *outlen, in++; } if ((c & 0xFC00) == 0xD800) { /* surrogates */ - if (in >= inend) { /* (in > inend) shouldn't happens */ - break; - } + if (in >= inend) { /* (in > inend) shouldn't happens */ + break; + } if (xmlLittleEndian) { d = *in++; } else { @@ -317,6 +385,24 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen, unsigned char *tmp; unsigned short tmp1, tmp2; + if (in == NULL) { + /* + * initialization, add the Byte Order Mark + */ + if (*outlen >= 2) { + outb[0] = 0xFF; + outb[1] = 0xFE; + *outlen = 2; + *inlen = 0; +#ifdef DEBUG_ENCODING + fprintf(stderr, "Added FFFE Byte Order Mark\n"); +#endif + return(2); + } + *outlen = 0; + *inlen = 0; + return(0); + } outend = out + (*outlen / 2); while (in < inend) { d= *in++; @@ -385,7 +471,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen, break; processed = in; } - *outlen = out - outstart; + *outlen = (out - outstart) * 2; *inlen = processed - in; return(0); } @@ -509,6 +595,24 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen, unsigned char *tmp; unsigned short tmp1, tmp2; + if (in == NULL) { + /* + * initialization, add the Byte Order Mark + */ + if (*outlen >= 2) { + outb[0] = 0xFE; + outb[1] = 0xFF; + *outlen = 2; + *inlen = 0; +#ifdef DEBUG_ENCODING + fprintf(stderr, "Added FEFF Byte Order Mark\n"); +#endif + return(2); + } + *outlen = 0; + *inlen = 0; + return(0); + } outend = out + (*outlen / 2); while (in < inend) { d= *in++; @@ -574,7 +678,7 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen, break; processed = in; } - *outlen = out - outstart; + *outlen = (out - outstart) * 2; *inlen = processed - in; return(0); } @@ -695,6 +799,70 @@ xmlParseCharEncoding(const char* name) return(XML_CHAR_ENCODING_ERROR); } 
+/** + * xmlGetCharEncodingName: + * @enc: the encoding + * + * The "canonical" name for XML encoding. + * C.f. http://www.w3.org/TR/REC-xml#charencoding + * Section 4.3.3 Character Encoding in Entities + * + * Returns the canonical name for the given encoding + */ + +const char* +xmlGetCharEncodingName(xmlCharEncoding enc) { + switch (enc) { + case XML_CHAR_ENCODING_ERROR: + return(NULL); + case XML_CHAR_ENCODING_NONE: + return(NULL); + case XML_CHAR_ENCODING_UTF8: + return("UTF-8"); + case XML_CHAR_ENCODING_UTF16LE: + return("UTF-16"); + case XML_CHAR_ENCODING_UTF16BE: + return("UTF-16"); + case XML_CHAR_ENCODING_EBCDIC: + return("EBCDIC"); + case XML_CHAR_ENCODING_UCS4LE: + return("ISO-10646-UCS-4"); + case XML_CHAR_ENCODING_UCS4BE: + return("ISO-10646-UCS-4"); + case XML_CHAR_ENCODING_UCS4_2143: + return("ISO-10646-UCS-4"); + case XML_CHAR_ENCODING_UCS4_3412: + return("ISO-10646-UCS-4"); + case XML_CHAR_ENCODING_UCS2: + return("ISO-10646-UCS-2"); + case XML_CHAR_ENCODING_8859_1: + return("ISO-8859-1"); + case XML_CHAR_ENCODING_8859_2: + return("ISO-8859-2"); + case XML_CHAR_ENCODING_8859_3: + return("ISO-8859-3"); + case XML_CHAR_ENCODING_8859_4: + return("ISO-8859-4"); + case XML_CHAR_ENCODING_8859_5: + return("ISO-8859-5"); + case XML_CHAR_ENCODING_8859_6: + return("ISO-8859-6"); + case XML_CHAR_ENCODING_8859_7: + return("ISO-8859-7"); + case XML_CHAR_ENCODING_8859_8: + return("ISO-8859-8"); + case XML_CHAR_ENCODING_8859_9: + return("ISO-8859-9"); + case XML_CHAR_ENCODING_2022_JP: + return("ISO-2022-JP"); + case XML_CHAR_ENCODING_SHIFT_JIS: + return("Shift-JIS"); + case XML_CHAR_ENCODING_EUC_JP: + return("EUC-JP"); + } + return(NULL); +} + /**************************************************************** * * * Char encoding handlers * @@ -883,7 +1051,7 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) { handler = xmlFindCharEncodingHandler("ebcdic"); if (handler != NULL) return(handler); break; - case XML_CHAR_ENCODING_UCS4LE: + case XML_CHAR_ENCODING_UCS4BE: 
handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("UCS-4"); @@ -891,8 +1059,12 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) { handler = xmlFindCharEncodingHandler("UCS4"); if (handler != NULL) return(handler); break; - case XML_CHAR_ENCODING_UCS4BE: - handler = xmlFindCharEncodingHandler("UCS4BE"); + case XML_CHAR_ENCODING_UCS4LE: + handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS-4"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS4"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_UCS4_2143: @@ -953,9 +1125,10 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) { */ xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name) { + xmlCharEncodingHandlerPtr enc; + xmlCharEncoding alias; #ifdef LIBXML_ICONV_ENABLED iconv_t icv_in, icv_out; - xmlCharEncodingHandlerPtr enc; #endif /* LIBXML_ICONV_ENABLED */ char upper[100]; int i; @@ -964,6 +1137,9 @@ xmlFindCharEncodingHandler(const char *name) { if (name == NULL) return(xmlDefaultCharEncodingHandler); if (name[0] == 0) return(xmlDefaultCharEncodingHandler); + /* + * Check first for directly registered encoding names + */ for (i = 0;i < 99;i++) { upper[i] = toupper(name[i]); if (upper[i] == 0) break; @@ -1002,9 +1178,23 @@ xmlFindCharEncodingHandler(const char *name) { fprintf(stderr, "iconv : problems with filters for '%s'\n", name); } #endif /* LIBXML_ICONV_ENABLED */ + #ifdef DEBUG_ENCODING fprintf(stderr, "No handler found for encoding %s\n", name); #endif + + /* + * Fallback using the canonical names + */ + alias = xmlParseCharEncoding(name); + if (alias != XML_CHAR_ENCODING_ERROR) { + const char* canon; + canon = xmlGetCharEncodingName(alias); + if ((canon != NULL) && (strcmp(name, canon))) { + return(xmlFindCharEncodingHandler(canon)); + } + } + return(NULL); } @@ 
-1040,8 +1230,13 @@ xmlIconvWrapper(iconv_t cd, ret = iconv(cd, &icv_in, &icv_inlen, &icv_out, &icv_outlen); - *inlen -= icv_inlen; - *outlen -= icv_outlen; + if (in != NULL) { + *inlen -= icv_inlen; + *outlen -= icv_outlen; + } else { + *inlen = 0; + *outlen = 0; + } if (icv_inlen != 0 || ret == (size_t) -1) { #ifdef EILSEQ if (errno == EILSEQ) { @@ -1066,6 +1261,91 @@ xmlIconvWrapper(iconv_t cd, } #endif /* LIBXML_ICONV_ENABLED */ +/** + * xmlCharEncFirstLine: + * @handler: char enconding transformation data structure + * @out: an xmlBuffer for the output. + * @in: an xmlBuffer for the input + * + * Front-end for the encoding handler input function, but handle only + * the very first line, i.e. limit itself to 45 chars. + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, + xmlBufferPtr in) { + int ret = -2; + int written; + int toconv; + + if (handler == NULL) return(-1); + if (out == NULL) return(-1); + if (in == NULL) return(-1); + + written = out->size - out->use; + toconv = in->use; + if (toconv * 2 >= written) { + xmlBufferGrow(out, toconv); + written = out->size - out->use - 1; + } + + /* + * echo '' | wc -c => 38 + * 45 chars should be sufficient to reach the end of the encoding + * decalration without going too far inside the document content. 
+ */ + written = 45; + + if (handler->input != NULL) { + ret = handler->input(&out->content[out->use], &written, + in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + } +#ifdef LIBXML_ICONV_ENABLED + else if (handler->iconv_in != NULL) { + ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + if (ret == -1) ret = -3; + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef DEBUG_ENCODING + switch (ret) { + case 0: + fprintf(stderr, "converted %d bytes to %d bytes of input\n", + toconv, written); + break; + case -1: + fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n", + toconv, written, in->use); + break; + case -2: + fprintf(stderr, "input conversion failed due to input error\n"); + break; + case -3: + fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n", + toconv, written, in->use); + break; + default: + fprintf(stderr,"Unknown input conversion failed %d\n", ret); + } +#endif + /* + * Ignore when input buffer is not on a boundary + */ + if (ret == -3) ret = 0; + if (ret == -1) ret = 0; + return(ret); +} + /** * xmlCharEncInFunc: * @handler: char enconding transformation data structure @@ -1113,8 +1393,8 @@ xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, if (ret == -1) ret = -3; } #endif /* LIBXML_ICONV_ENABLED */ -#ifdef DEBUG_ENCODING switch (ret) { +#ifdef DEBUG_ENCODING case 0: fprintf(stderr, "converted %d bytes to %d bytes of input\n", toconv, written); @@ -1123,17 +1403,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n", toconv, written, in->use); break; - case -2: - fprintf(stderr, "input conversion failed due to input error\n"); - break; case -3: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n", toconv, written, 
in->use); break; - default: - fprintf(stderr,"Unknown input conversion failed %d\n", ret); - } #endif + case -2: + fprintf(stderr, "input conversion failed due to input error\n"); + fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + in->content[0], in->content[1], + in->content[2], in->content[3]); + } /* * Ignore when input buffer is not on a boundary */ @@ -1148,6 +1428,11 @@ xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, * @in: an xmlBuffer for the input * * Generic front-end for the encoding handler output function + * a first call with @in == NULL has to be made firs to initiate the + * output in case of non-stateless encoding needing to initiate their + * state or the output (like the BOM in UTF16). + * In case of UTF8 sequence conversion errors for the given encoder, + * the content will be automatically remapped to a CharRef sequence. * * Returns the number of byte written if success, or * -1 general error @@ -1160,12 +1445,43 @@ xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, int ret = -2; int written; int toconv; + int output = 0; if (handler == NULL) return(-1); if (out == NULL) return(-1); - if (in == NULL) return(-1); +retry: + written = out->size - out->use; + + /* + * First specific handling of in = NULL, i.e. the initialization call + */ + if (in == NULL) { + toconv = 0; + if (handler->output != NULL) { + ret = handler->output(&out->content[out->use], &written, + NULL, &toconv); + out->use += written; + out->content[out->use] = 0; + } +#ifdef LIBXML_ICONV_ENABLED + else if (handler->iconv_out != NULL) { + ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use], + &written, NULL, &toconv); + out->use += written; + out->content[out->use] = 0; + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef DEBUG_ENCODING + fprintf(stderr, "initialized encoder\n"); +#endif + return(0); + } + + /* + * Convertion itself. 
+ */ toconv = in->use; if (toconv * 2 >= written) { xmlBufferGrow(out, toconv * 2); @@ -1173,7 +1489,7 @@ xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, } if (handler->output != NULL) { ret = handler->output(&out->content[out->use], &written, - in->content, &toconv); + in->content, &toconv); xmlBufferShrink(in, toconv); out->use += written; out->content[out->use] = 0; @@ -1188,8 +1504,14 @@ xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, if (ret == -1) ret = -3; } #endif /* LIBXML_ICONV_ENABLED */ -#ifdef DEBUG_ENCODING + + if (ret >= 0) output += ret; + + /* + * Attempt to handle error cases + */ switch (ret) { +#ifdef DEBUG_ENCODING case 0: fprintf(stderr, "converted %d bytes to %d bytes of output\n", toconv, written); @@ -1197,17 +1519,45 @@ xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, case -1: fprintf(stderr, "output conversion failed by lack of space\n"); break; - case -2: - fprintf(stderr, "output conversion failed due to output error\n"); - break; case -3: fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n", toconv, written, in->use); break; - default: - fprintf(stderr,"Unknown output conversion failed %d\n", ret); - } #endif + case -2: { + int len = in->use; + const char *utf = (const char *) in->content; + int cur; + + cur = xmlGetUTF8Char(utf, &len); + if (cur > 0) { + xmlChar charref[20]; + +#ifdef DEBUG_ENCODING + fprintf(stderr, "handling output conversion error\n"); + fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + in->content[0], in->content[1], + in->content[2], in->content[3]); +#endif + /* + * Removes the UTF8 sequence, and replace it by a charref + * and continue the transcoding phase, hoping the error + * did not mangle the encoder state. 
+ */ + sprintf(charref, "&#x%X;", cur); + xmlBufferShrink(in, len); + xmlBufferAddHead(in, charref, -1); + + goto retry; + } else { + fprintf(stderr, "output conversion failed due to conv error\n"); + fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + in->content[0], in->content[1], + in->content[2], in->content[3]); + } + break; + } + } return(ret); } diff --git a/encoding.h b/encoding.h index f6edbf29..ce0ab755 100644 --- a/encoding.h +++ b/encoding.h @@ -102,6 +102,8 @@ typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen, * * Take a block of UTF-8 chars in and try to convert it to an other * encoding. + * Note: a first call designed to produce heading info is called with + * in = NULL. If stateful this should also initialize the encoder state * * Returns the number of byte written, or -1 by lack of space, or -2 * if the transcoding failed. @@ -136,6 +138,7 @@ void xmlRegisterCharEncodingHandler (xmlCharEncodingHandlerPtr handler); xmlCharEncoding xmlDetectCharEncoding (const unsigned char* in, int len); xmlCharEncoding xmlParseCharEncoding (const char* name); +const char* xmlGetCharEncodingName (xmlCharEncoding enc); xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc); xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name); int xmlCheckUTF8 (const unsigned char *utf); @@ -147,6 +150,9 @@ int xmlCharEncOutFunc (xmlCharEncodingHandler *handler, int xmlCharEncInFunc (xmlCharEncodingHandler *handler, xmlBufferPtr out, xmlBufferPtr in); +int xmlCharEncFirstLine (xmlCharEncodingHandler *handler, + xmlBufferPtr out, + xmlBufferPtr in); int xmlCharEncCloseFunc (xmlCharEncodingHandler *handler); #ifdef __cplusplus diff --git a/entities.c b/entities.c index 15247a2d..c541d679 100644 --- a/entities.c +++ b/entities.c @@ -945,6 +945,86 @@ xmlEncodeEntitiesReentrant(xmlDocPtr doc, const xmlChar *input) { return(buffer); } +/** + * xmlEncodeSpecialChars: + * @doc: the document containing the string + * @input: A 
string to convert to XML. + * + * Do a global encoding of a string, replacing the predefined entities + * this routine is reentrant, and result must be deallocated. + * + * Returns A newly allocated string with the substitution done. + */ +xmlChar * +xmlEncodeSpecialChars(xmlDocPtr doc, const xmlChar *input) { + const xmlChar *cur = input; + xmlChar *buffer = NULL; + xmlChar *out = NULL; + int buffer_size = 0; + int html = 0; + + if (input == NULL) return(NULL); + if (doc != NULL) + html = (doc->type == XML_HTML_DOCUMENT_NODE); + + /* + * allocate an translation buffer. + */ + buffer_size = 1000; + buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar)); + if (buffer == NULL) { + perror("malloc failed"); + return(NULL); + } + out = buffer; + + while (*cur != '\0') { + if (out - buffer > buffer_size - 10) { + int index = out - buffer; + + growBufferReentrant(); + out = &buffer[index]; + } + + /* + * By default one have to encode at least '<', '>', '"' and '&' ! + */ + if (*cur == '<') { + *out++ = '&'; + *out++ = 'l'; + *out++ = 't'; + *out++ = ';'; + } else if (*cur == '>') { + *out++ = '&'; + *out++ = 'g'; + *out++ = 't'; + *out++ = ';'; + } else if (*cur == '&') { + *out++ = '&'; + *out++ = 'a'; + *out++ = 'm'; + *out++ = 'p'; + *out++ = ';'; + } else if (*cur == '"') { + *out++ = '&'; + *out++ = 'q'; + *out++ = 'u'; + *out++ = 'o'; + *out++ = 't'; + *out++ = ';'; + } else { + /* + * Works because on UTF-8, all extended sequences cannot + * result in bytes in the ASCII range. 
+ */ + *out++ = *cur; + } + cur++; + } + *out++ = 0; + return(buffer); +} + /** * xmlCreateEntitiesTable: * diff --git a/entities.h b/entities.h index f0ec7314..c9bd0035 100644 --- a/entities.h +++ b/entities.h @@ -106,6 +106,8 @@ const xmlChar * xmlEncodeEntities (xmlDocPtr doc, const xmlChar *input); xmlChar * xmlEncodeEntitiesReentrant(xmlDocPtr doc, const xmlChar *input); +xmlChar * xmlEncodeSpecialChars (xmlDocPtr doc, + const xmlChar *input); xmlEntitiesTablePtr xmlCreateEntitiesTable (void); xmlEntitiesTablePtr xmlCopyEntitiesTable (xmlEntitiesTablePtr table); void xmlFreeEntitiesTable (xmlEntitiesTablePtr table); diff --git a/example/Makefile.am b/example/Makefile.am index 0978f95a..ce7a8da9 100644 --- a/example/Makefile.am +++ b/example/Makefile.am @@ -6,5 +6,9 @@ INCLUDES = \ LDADD = $(top_builddir)/libxml.la @Z_LIBS@ -$(top_builddir)/libxml.la: +$(srcdir)/libxml: + -$(RM) $(srcdir)/libxml + ln -s $(srcdir)/. $(srcdir)/libxml + +$(top_builddir)/libxml.la: $(srcdir)/libxml (cd .. 
; $(MAKE)) diff --git a/example/gjobread.c b/example/gjobread.c index ca386718..916fa53a 100644 --- a/example/gjobread.c +++ b/example/gjobread.c @@ -10,12 +10,18 @@ #include #include -#include -#if defined(LIBXML_VERSION) && LIBXML_VERSION >= 20000 +/* + * This example should compile and run indifferently with libxml-1.8.8 + + * and libxml2-2.1.0 + + * Check the COMPAT comments below + */ + +/* + * COMPAT using xml-config --cflags to get the include path this will + * work with both + */ +#include #include -#else -#include -#endif #define DEBUG(x) printf(x) @@ -50,12 +56,13 @@ DEBUG("parsePerson\n"); memset(ret, 0, sizeof(person)); /* We don't care what the top level element name is */ - cur = cur->children; + /* COMPAT xmlChildrenNode is a macro unifying libxml1 and libxml2 names */ + cur = cur->xmlChildrenNode; while (cur != NULL) { if ((!strcmp(cur->name, "Person")) && (cur->ns == ns)) - ret->name = xmlNodeListGetString(doc, cur->children, 1); + ret->name = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1); if ((!strcmp(cur->name, "Email")) && (cur->ns == ns)) - ret->email = xmlNodeListGetString(doc, cur->children, 1); + ret->email = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1); cur = cur->next; } @@ -108,7 +115,7 @@ DEBUG("parseJob\n"); memset(ret, 0, sizeof(job)); /* We don't care what the top level element name is */ - cur = cur->children; + cur = cur->xmlChildrenNode; while (cur != NULL) { if ((!strcmp(cur->name, "Project")) && (cur->ns == ns)) { @@ -118,9 +125,9 @@ DEBUG("parseJob\n"); } } if ((!strcmp(cur->name, "Application")) && (cur->ns == ns)) - ret->application = xmlNodeListGetString(doc, cur->children, 1); + ret->application = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1); if ((!strcmp(cur->name, "Category")) && (cur->ns == ns)) - ret->category = xmlNodeListGetString(doc, cur->children, 1); + ret->category = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1); if ((!strcmp(cur->name, "Contact")) && (cur->ns == ns)) ret->contact = 
parsePerson(doc, ns, cur); cur = cur->next; @@ -173,8 +180,6 @@ gJobPtr parseGjobFile(char *filename) { * Check the document is of the right kind */ - // cur = doc->root; - // cur = doc->children; cur = xmlDocGetRootElement(doc); if (cur == NULL) { fprintf(stderr,"empty document\n"); @@ -209,8 +214,7 @@ gJobPtr parseGjobFile(char *filename) { * Now, walk the tree. */ /* First level we expect just Jobs */ - // cur = cur->children; - cur = cur -> children; + cur = cur->xmlChildrenNode; while ( cur && xmlIsBlankNode ( cur ) ) { cur = cur -> next; @@ -229,7 +233,7 @@ gJobPtr parseGjobFile(char *filename) { } /* Second level is a list of Job, but be laxist */ - cur = cur->children; + cur = cur->xmlChildrenNode; while (cur != NULL) { if ((!strcmp(cur->name, "Job")) && (cur->ns == ns)) { job = parseJob(doc, ns, cur); @@ -257,6 +261,9 @@ int main(int argc, char **argv) { int i; gJobPtr cur; + /* COMPAT: Do not genrate nodes for formatting spaces */ + xmlKeepBlanksDefault(0); + for (i = 1; i < argc ; i++) { cur = parseGjobFile(argv[i]); if ( cur ) diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h index f6edbf29..ce0ab755 100644 --- a/include/libxml/encoding.h +++ b/include/libxml/encoding.h @@ -102,6 +102,8 @@ typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen, * * Take a block of UTF-8 chars in and try to convert it to an other * encoding. + * Note: a first call designed to produce heading info is called with + * in = NULL. If stateful this should also initialize the encoder state * * Returns the number of byte written, or -1 by lack of space, or -2 * if the transcoding failed. 
@@ -136,6 +138,7 @@ void xmlRegisterCharEncodingHandler (xmlCharEncodingHandlerPtr handler); xmlCharEncoding xmlDetectCharEncoding (const unsigned char* in, int len); xmlCharEncoding xmlParseCharEncoding (const char* name); +const char* xmlGetCharEncodingName (xmlCharEncoding enc); xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc); xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name); int xmlCheckUTF8 (const unsigned char *utf); @@ -147,6 +150,9 @@ int xmlCharEncOutFunc (xmlCharEncodingHandler *handler, int xmlCharEncInFunc (xmlCharEncodingHandler *handler, xmlBufferPtr out, xmlBufferPtr in); +int xmlCharEncFirstLine (xmlCharEncodingHandler *handler, + xmlBufferPtr out, + xmlBufferPtr in); int xmlCharEncCloseFunc (xmlCharEncodingHandler *handler); #ifdef __cplusplus diff --git a/include/libxml/entities.h b/include/libxml/entities.h index f0ec7314..c9bd0035 100644 --- a/include/libxml/entities.h +++ b/include/libxml/entities.h @@ -106,6 +106,8 @@ const xmlChar * xmlEncodeEntities (xmlDocPtr doc, const xmlChar *input); xmlChar * xmlEncodeEntitiesReentrant(xmlDocPtr doc, const xmlChar *input); +xmlChar * xmlEncodeSpecialChars (xmlDocPtr doc, + const xmlChar *input); xmlEntitiesTablePtr xmlCreateEntitiesTable (void); xmlEntitiesTablePtr xmlCopyEntitiesTable (xmlEntitiesTablePtr table); void xmlFreeEntitiesTable (xmlEntitiesTablePtr table); diff --git a/include/libxml/parser.h b/include/libxml/parser.h index aa7e2735..9cb73bd0 100644 --- a/include/libxml/parser.h +++ b/include/libxml/parser.h @@ -120,8 +120,8 @@ struct _xmlParserCtxt { xmlDocPtr myDoc; /* the document being built */ int wellFormed; /* is the document well formed */ int replaceEntities; /* shall we replace entities ? 
*/ - const xmlChar *version; /* the XML version string */ - const xmlChar *encoding; /* encoding, if any */ + const xmlChar *version; /* the XML version string */ + const xmlChar *encoding; /* the declared encoding, if any */ int standalone; /* standalone document */ int html; /* are we parsing an HTML document */ @@ -177,7 +177,11 @@ struct _xmlParserCtxt { int * spaceTab; /* array of space infos */ int depth; /* to prevent entity substitution loops */ - xmlParserInputPtr entity; /* used to check entities boundaries */ + xmlParserInputPtr entity; /* used to check entities boundaries */ + int charset; /* encoding of the in-memory content + actually an xmlCharEncoding */ + int nodelen; /* Those two fields are there to */ + int nodemem; /* Speed up large node parsing */ }; /** diff --git a/include/libxml/tree.h b/include/libxml/tree.h index 35ea5256..9c5b2805 100644 --- a/include/libxml/tree.h +++ b/include/libxml/tree.h @@ -181,6 +181,7 @@ struct _xmlElement { xmlElementTypeVal etype; /* The type */ xmlElementContentPtr content; /* the allowed element content */ xmlAttributePtr attributes; /* List of the declared attributes */ + const xmlChar *prefix; /* the namespace prefix if any */ }; /* @@ -347,12 +348,22 @@ struct _xmlDoc { struct _xmlDtd *extSubset; /* the document external subset */ struct _xmlNs *oldNs; /* Global namespace, the old way */ const xmlChar *version; /* the XML version string */ - const xmlChar *encoding; /* encoding, if any */ + const xmlChar *encoding; /* external initial encoding, if any */ void *ids; /* Hash table for ID attributes if any */ void *refs; /* Hash table for IDREFs attributes if any */ const xmlChar *URL; /* The URI for that document */ + int charset; /* encoding of the in-memory content + actually an xmlCharEncoding */ }; +/* + * Compatibility naming layer with libxml1 + */ +#ifndef xmlChildrenNode +#define xmlChildrenNode children +#define xmlRootNode children +#endif + /* * Variables. 
*/ @@ -374,6 +385,9 @@ int xmlBufferDump (FILE *file, void xmlBufferAdd (xmlBufferPtr buf, const xmlChar *str, int len); +void xmlBufferAddHead (xmlBufferPtr buf, + const xmlChar *str, + int len); void xmlBufferCat (xmlBufferPtr buf, const xmlChar *str); void xmlBufferCCat (xmlBufferPtr buf, @@ -545,6 +559,9 @@ xmlNodePtr xmlStringLenGetNodeList (xmlDocPtr doc, xmlChar * xmlNodeListGetString (xmlDocPtr doc, xmlNodePtr list, int inLine); +xmlChar * xmlNodeListGetRawString (xmlDocPtr doc, + xmlNodePtr list, + int inLine); void xmlNodeSetContent (xmlNodePtr cur, const xmlChar *content); void xmlNodeSetContentLen (xmlNodePtr cur, @@ -591,7 +608,7 @@ int xmlReconciliateNs (xmlDocPtr doc, void xmlDocDumpMemory (xmlDocPtr cur, xmlChar**mem, int *size); -void xmlDocDump (FILE *f, +int xmlDocDump (FILE *f, xmlDocPtr cur); void xmlElemDump (FILE *f, xmlDocPtr doc, @@ -599,6 +616,17 @@ void xmlElemDump (FILE *f, int xmlSaveFile (const char *filename, xmlDocPtr cur); +/* This one is exported from xmlIO.h + +int xmlSaveFileTo (xmlOutputBuffer *buf, + xmlDocPtr cur, + const char *encoding); + */ + +int xmlSaveFileEnc (const char *filename, + xmlDocPtr cur, + const char *encoding); + /* * Compression */ diff --git a/include/libxml/uri.h b/include/libxml/uri.h index 59c4467c..705e3851 100644 --- a/include/libxml/uri.h +++ b/include/libxml/uri.h @@ -43,6 +43,8 @@ xmlURIPtr xmlCreateURI (void); xmlChar * xmlBuildURI (const xmlChar *URI, const xmlChar *base); xmlURIPtr xmlParseURI (const char *URI); +int xmlParseURIReference (xmlURIPtr uri, + const char *str); xmlChar * xmlSaveUri (xmlURIPtr uri); void xmlPrintURI (FILE *stream, xmlURIPtr uri); diff --git a/include/libxml/xmlIO.h b/include/libxml/xmlIO.h index 2d14ebeb..5289367e 100644 --- a/include/libxml/xmlIO.h +++ b/include/libxml/xmlIO.h @@ -18,6 +18,11 @@ extern "C" { #endif +/* + * Those are the functions and datatypes for the parser input + * I/O structures. 
+ */ + typedef int (*xmlInputMatchCallback) (char const *filename); typedef void * (*xmlInputOpenCallback) (char const *filename); typedef int (*xmlInputReadCallback) (void * context, char * buffer, int len); @@ -32,13 +37,38 @@ struct _xmlParserInputBuffer { xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */ - xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 */ + xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 or ISOLatin */ xmlBufferPtr raw; /* if encoder != NULL buffer for raw input */ }; /* - * Interfaces + * Those are the functions and datatypes for the library output + * I/O structures. + */ + +typedef int (*xmlOutputMatchCallback) (char const *filename); +typedef void * (*xmlOutputOpenCallback) (char const *filename); +typedef int (*xmlOutputWriteCallback) (void * context, const char * buffer, + int len); +typedef void (*xmlOutputCloseCallback) (void * context); + +typedef struct _xmlOutputBuffer xmlOutputBuffer; +typedef xmlOutputBuffer *xmlOutputBufferPtr; +struct _xmlOutputBuffer { + void* context; + xmlOutputWriteCallback writecallback; + xmlOutputCloseCallback closecallback; + + xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */ + + xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 or ISOLatin */ + xmlBufferPtr conv; /* if encoder != NULL buffer for output */ + int written; /* total number of byte written */ +}; + +/* + * Interfaces for input */ xmlParserInputBufferPtr @@ -72,6 +102,51 @@ int xmlRegisterInputCallbacks (xmlInputMatchCallback match, xmlInputOpenCallback open, xmlInputReadCallback read, xmlInputCloseCallback close); +/* + * Interfaces for output + */ +xmlOutputBufferPtr + xmlAllocOutputBuffer (xmlCharEncodingHandlerPtr encoder); + +xmlOutputBufferPtr + xmlOutputBufferCreateFilename (const char *URI, + xmlCharEncodingHandlerPtr encoder, + int compression); + +xmlOutputBufferPtr + xmlOutputBufferCreateFile (FILE *file, + xmlCharEncodingHandlerPtr encoder); + +xmlOutputBufferPtr + 
xmlOutputBufferCreateFd (int fd, + xmlCharEncodingHandlerPtr encoder); + +xmlOutputBufferPtr + xmlOutputBufferCreateIO (xmlOutputWriteCallback iowrite, + xmlOutputCloseCallback ioclose, + void *ioctx, + xmlCharEncodingHandlerPtr encoder); + +int xmlOutputBufferWrite (xmlOutputBufferPtr out, + int len, + const char *buf); +int xmlOutputBufferWriteString (xmlOutputBufferPtr out, + const char *str); + +int xmlOutputBufferFlush (xmlOutputBufferPtr out); +int xmlOutputBufferClose (xmlOutputBufferPtr out); + +int xmlRegisterOutputCallbacks (xmlOutputMatchCallback match, + xmlOutputOpenCallback open, + xmlOutputWriteCallback write, + xmlOutputCloseCallback close); + +/* + * This save function is part of tree.h actually + */ +int xmlSaveFileTo (xmlOutputBuffer *buf, + xmlDocPtr cur, + const char *encoding); #ifdef __cplusplus } #endif diff --git a/include/libxml/xmlversion.h.in b/include/libxml/xmlversion.h.in index 5a62c27b..b5edc09f 100644 --- a/include/libxml/xmlversion.h.in +++ b/include/libxml/xmlversion.h.in @@ -9,8 +9,14 @@ #ifndef __XML_VERSION_H__ #define __XML_VERSION_H__ +/* + * use those to be sure nothing nasty will happen if + * your library and includes mismatch + */ +extern void xmlCheckVersion(int version); #define LIBXML_VERSION @LIBXML_VERSION_NUMBER@ #define LIBXML_VERSION_STRING "@LIBXML_VERSION_NUMBER@" +#define LIBXML_TEST_VERSION xmlCheckVersion(@LIBXML_VERSION_NUMBER@); /* * Whether the FTP support is configured in diff --git a/include/libxml/xpath.h b/include/libxml/xpath.h index 73e3f659..fc567fd8 100644 --- a/include/libxml/xpath.h +++ b/include/libxml/xpath.h @@ -29,8 +29,8 @@ typedef xmlXPathParserContext *xmlXPathParserContextPtr; typedef struct _xmlNodeSet xmlNodeSet; typedef xmlNodeSet *xmlNodeSetPtr; struct _xmlNodeSet { - int nodeNr; /* # of node in the set */ - int nodeMax; /* allocated space */ + int nodeNr; /* number of nodes in the set */ + int nodeMax; /* size of the array as allocated */ xmlNodePtr *nodeTab; /* array of nodes in no 
particular order */ }; @@ -41,6 +41,8 @@ struct _xmlNodeSet { * - boolean * - number * - string + * + * @@ XPointer will add more types ! */ #define XPATH_UNDEFINED 0 diff --git a/nanoftp.c b/nanoftp.c index 6ca37013..9ebe015e 100644 --- a/nanoftp.c +++ b/nanoftp.c @@ -1193,7 +1193,9 @@ xmlNanoFTPGetConnection(void *ctx) { if (sscanf(cur, "%d,%d,%d,%d,%d,%d", &temp[0], &temp[1], &temp[2], &temp[3], &temp[4], &temp[5]) != 6) { fprintf(stderr, "Invalid answer to PASV\n"); - close(ctxt->dataFd); ctxt->dataFd = -1; + if (ctxt->dataFd != -1) { + close(ctxt->dataFd); ctxt->dataFd = -1; + } return(-1); } for (i=0; i<6; i++) ad[i] = (unsigned char) (temp[i] & 0xff); @@ -1448,6 +1450,8 @@ xmlNanoFTPList(void *ctx, ftpListCallback callback, void *userData, if (xmlNanoFTPCwd(ctxt, ctxt->path) < 1) return(-1); ctxt->dataFd = xmlNanoFTPGetConnection(ctxt); + if (ctxt->dataFd == -1) + return(-1); #ifndef HAVE_SNPRINTF len = sprintf(buf, "LIST -L\r\n"); #else /* HAVE_SNPRINTF */ @@ -1459,6 +1463,8 @@ xmlNanoFTPList(void *ctx, ftpListCallback callback, void *userData, return(-1); } ctxt->dataFd = xmlNanoFTPGetConnection(ctxt); + if (ctxt->dataFd == -1) + return(-1); #ifndef HAVE_SNPRINTF len = sprintf(buf, "LIST -L %s\r\n", filename); #else /* HAVE_SNPRINTF */ @@ -1554,6 +1560,8 @@ xmlNanoFTPGetSocket(void *ctx, const char *filename) { if ((filename == NULL) && (ctxt->path == NULL)) return(-1); ctxt->dataFd = xmlNanoFTPGetConnection(ctxt); + if (ctxt->dataFd == -1) + return(-1); #ifndef HAVE_SNPRINTF len = sprintf(buf, "TYPE I\r\n"); diff --git a/nanohttp.c b/nanohttp.c index a94d6bb0..33d0b7ae 100644 --- a/nanohttp.c +++ b/nanohttp.c @@ -654,6 +654,23 @@ xmlNanoHTTPConnectAttempt(struct in_addr ia, int port) close(s); return(-1); } + + if ( FD_ISSET(s, &wfd) ) { + socklen_t len; + len = sizeof(status); + if (getsockopt(s, SOL_SOCKET, SO_ERROR, &status, &len) < 0 ) { + /* Solaris error code */ + return (-1); + } + if ( status ) { + close (s); + errno = status; + return (-1); + } + 
} else { + /* pbm */ + return (-1); + } return(s); } diff --git a/parser.c b/parser.c index 6714d3cf..6667106d 100644 --- a/parser.c +++ b/parser.c @@ -47,7 +47,6 @@ #define XML_PARSER_BIG_BUFFER_SIZE 1000 #define XML_PARSER_BUFFER_SIZE 100 -const char *xmlParserVersion = LIBXML_VERSION_STRING; int xmlGetWarningsDefaultValue = 1; /* @@ -63,6 +62,37 @@ void xmlParserHandleReference(xmlParserCtxtPtr ctxt); void xmlParserHandlePEReference(xmlParserCtxtPtr ctxt); xmlEntityPtr xmlParseStringPEReference(xmlParserCtxtPtr ctxt, const xmlChar **str); + +/* + * Version handling + */ +const char *xmlParserVersion = LIBXML_VERSION_STRING; + +/* + * xmlCheckVersion: + * @version: the include version number + * + * check the compiled lib version against the include one. + * This can warn or immediately kill the application + */ +void +xmlCheckVersion(int version) { + int myversion = LIBXML_VERSION; + + if ((myversion / 10000) != (version / 10000)) { + fprintf(stderr, + "Fatal: program compiled against libxml %d using libxml %d\n", + (version / 10000), (myversion / 10000)); + exit(1); + } + if ((myversion / 100) < (version / 100)) { + fprintf(stderr, + "Warning: program compiled against libxml %d using older %d\n", + (version / 100), (myversion / 100)); + } +} + + /************************************************************************ * * * Input handling functions for progressive parsing * @@ -431,7 +461,7 @@ xmlNextChar(xmlParserCtxtPtr ctxt) { if (*(ctxt->input->cur) == '\n') { ctxt->input->line++; ctxt->input->col = 1; } else ctxt->input->col++; - if (ctxt->encoding == NULL) { + if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: @@ -522,12 +552,16 @@ encoding_error: * to ISO-Latin-1 (if you don't like this policy, just declare the * encoding !) 
*/ - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) { ctxt->sax->error(ctxt->userData, "Input is not proper UTF-8, indicate encoding !\n"); + ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + ctxt->input->cur[0], ctxt->input->cur[1], + ctxt->input->cur[2], ctxt->input->cur[3]); + } ctxt->errNo = XML_ERR_INVALID_ENCODING; - ctxt->encoding = xmlStrdup(BAD_CAST "ISO-8859-1"); + ctxt->charset = XML_CHAR_ENCODING_8859_1; ctxt->input->cur++; return; } @@ -556,7 +590,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { *len = 0; return(ctxt->token); } - if (ctxt->encoding == NULL) { + if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: @@ -654,12 +688,16 @@ encoding_error: * to ISO-Latin-1 (if you don't like this policy, just declare the * encoding !) */ - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) { ctxt->sax->error(ctxt->userData, "Input is not proper UTF-8, indicate encoding !\n"); + ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + ctxt->input->cur[0], ctxt->input->cur[1], + ctxt->input->cur[2], ctxt->input->cur[3]); + } ctxt->errNo = XML_ERR_INVALID_ENCODING; - ctxt->encoding = xmlStrdup(BAD_CAST "ISO-8859-1"); + ctxt->charset = XML_CHAR_ENCODING_8859_1; *len = 1; return((int) *ctxt->input->cur); } @@ -678,7 +716,7 @@ encoding_error: int xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar *cur, int *len) { - if (ctxt->encoding == NULL) { + if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: @@ -755,9 +793,13 @@ encoding_error: * to ISO-Latin-1 (if you don't like this policy, just declare the * encoding !) 
*/ - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) { ctxt->sax->error(ctxt->userData, "Input is not proper UTF-8, indicate encoding !\n"); + ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + ctxt->input->cur[0], ctxt->input->cur[1], + ctxt->input->cur[2], ctxt->input->cur[3]); + } ctxt->errNo = XML_ERR_INVALID_ENCODING; *len = 1; @@ -1224,6 +1266,7 @@ xmlInitParserCtxt(xmlParserCtxtPtr ctxt) ctxt->inSubset = 0; ctxt->errNo = XML_ERR_OK; ctxt->depth = 0; + ctxt->charset = XML_CHAR_ENCODING_UTF8; xmlInitNodeInfoSeq(&ctxt->node_seq); } @@ -1463,7 +1506,7 @@ xmlParseStringCharRef(xmlParserCtxtPtr ctxt, const xmlChar **str) { ctxt->errNo = XML_ERR_INVALID_HEX_CHARREF; if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, - "xmlParseCharRef: invalid hexadecimal value\n"); + "xmlParseStringCharRef: invalid hexadecimal value\n"); ctxt->wellFormed = 0; ctxt->disableSAX = 1; val = 0; @@ -1484,7 +1527,7 @@ xmlParseStringCharRef(xmlParserCtxtPtr ctxt, const xmlChar **str) { ctxt->errNo = XML_ERR_INVALID_DEC_CHARREF; if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, - "xmlParseCharRef: invalid decimal value\n"); + "xmlParseStringCharRef: invalid decimal value\n"); ctxt->wellFormed = 0; ctxt->disableSAX = 1; val = 0; @@ -2297,9 +2340,11 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) break; case XML_CHAR_ENCODING_NONE: /* let's assume it's UTF-8 without the XML decl */ + ctxt->charset = XML_CHAR_ENCODING_UTF8; return(0); case XML_CHAR_ENCODING_UTF8: /* default encoding, no conversion should be needed */ + ctxt->charset = XML_CHAR_ENCODING_UTF8; return(0); default: break; @@ -2316,12 +2361,15 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) ctxt->sax->error(ctxt->userData, "encoding unknown\n"); ctxt->wellFormed = 0; ctxt->disableSAX = 1; + ctxt->charset = XML_CHAR_ENCODING_UTF8; break; 
case XML_CHAR_ENCODING_NONE: /* let's assume it's UTF-8 without the XML decl */ + ctxt->charset = XML_CHAR_ENCODING_UTF8; return(0); case XML_CHAR_ENCODING_UTF8: /* default encoding, no conversion should be needed */ + ctxt->charset = XML_CHAR_ENCODING_UTF8; return(0); case XML_CHAR_ENCODING_UTF16LE: break; @@ -2380,6 +2428,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) (ctxt->input->encoding != NULL)) { ctxt->encoding = xmlStrdup(ctxt->input->encoding); } + ctxt->charset = enc; return(0); case XML_CHAR_ENCODING_2022_JP: ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; @@ -2403,6 +2452,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) } if (handler == NULL) return(-1); + ctxt->charset = XML_CHAR_ENCODING_UTF8; return(xmlSwitchToEncoding(ctxt, handler)); } @@ -2427,10 +2477,14 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) if (ctxt->input->buf->encoder != NULL) { if (ctxt->input->buf->encoder == handler) return(0); - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "xmlSwitchEncoding : encoder already regitered\n"); - return(-1); + /* + * Note: this is a bit dangerous, but that's what it + * takes to use nearly compatible signature for different + * encodings. + */ + xmlCharEncCloseFunc(ctxt->input->buf->encoder); + ctxt->input->buf->encoder = handler; + return(0); } ctxt->input->buf->encoder = handler; @@ -2468,12 +2522,14 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) ctxt->input->buf->buffer = xmlBufferCreate(); /* - * convert as much as possible of the raw input - * to the parser reading buffer. + * convert just enough to get + * '' + * parsed with the autodetected encoding + * into the parser reading buffer. 
*/ - nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, - ctxt->input->buf->buffer, - ctxt->input->buf->raw); + nbchars = xmlCharEncFirstLine(ctxt->input->buf->encoder, + ctxt->input->buf->buffer, + ctxt->input->buf->raw); if (nbchars < 0) { fprintf(stderr, "xmlSwitchToEncoding: encoder error\n"); return(-1); @@ -2536,10 +2592,7 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) /* * The parsing is now done in UTF8 natively */ - if (ctxt->encoding != NULL) { - xmlFree((xmlChar *) ctxt->encoding); - ctxt->encoding = NULL; - } + ctxt->charset = XML_CHAR_ENCODING_UTF8; } else return(-1); return(0); @@ -3740,13 +3793,12 @@ xmlParseEntityValue(xmlParserCtxtPtr ctxt, xmlChar **orig) { xmlChar * xmlParseAttValue(xmlParserCtxtPtr ctxt) { xmlChar limit = 0; - xmlChar *buffer = NULL; - int buffer_size = 0; - xmlChar *out = NULL; - + xmlChar *buf = NULL; + int len = 0; + int buf_size = 0; + int c, l; xmlChar *current = NULL; xmlEntityPtr ent; - xmlChar cur; SHRINK; @@ -3770,24 +3822,24 @@ xmlParseAttValue(xmlParserCtxtPtr ctxt) { /* * allocate a translation buffer. */ - buffer_size = XML_PARSER_BUFFER_SIZE; - buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar)); - if (buffer == NULL) { + buf_size = XML_PARSER_BUFFER_SIZE; + buf = (xmlChar *) xmlMalloc(buf_size * sizeof(xmlChar)); + if (buf == NULL) { perror("xmlParseAttValue: malloc failed"); return(NULL); } - out = buffer; /* * Ok loop until we reach one of the ending char or a size limit. 
*/ - cur = CUR; - while (((NXT(0) != limit) && (cur != '<')) || (ctxt->token != 0)) { - if (cur == 0) break; - if ((cur == '&') && (NXT(1) == '#')) { + c = CUR_CHAR(l); + while (((NXT(0) != limit) && (c != '<')) || (ctxt->token != 0)) { + if (c == 0) break; + if ((c == '&') && (NXT(1) == '#')) { int val = xmlParseCharRef(ctxt); - *out++ = val; - } else if (cur == '&') { + COPY_BUF(l,buf,len,val); + NEXTL(l); + } else if (c == '&') { ent = xmlParseEntityRef(ctxt); if ((ent != NULL) && (ctxt->replaceEntities != 0)) { @@ -3799,19 +3851,16 @@ xmlParseAttValue(xmlParserCtxtPtr ctxt) { if (rep != NULL) { current = rep; while (*current != 0) { - *out++ = *current++; - if (out - buffer > buffer_size - 10) { - int index = out - buffer; - - growBuffer(buffer); - out = &buffer[index]; + buf[len++] = *current++; + if (len > buf_size - 10) { + growBuffer(buf); } } xmlFree(rep); } } else { if (ent->content != NULL) - *out++ = ent->content[0]; + buf[len++] = ent->content[0]; } } else if (ent != NULL) { int i = xmlStrlen(ent->name); @@ -3832,41 +3881,32 @@ xmlParseAttValue(xmlParserCtxtPtr ctxt) { /* * Just output the reference */ - *out++ = '&'; - if (out - buffer > buffer_size - i - 10) { - int index = out - buffer; - - growBuffer(buffer); - out = &buffer[index]; + buf[len++] = '&'; + if (len > buf_size - i - 10) { + growBuffer(buf); } for (;i > 0;i--) - *out++ = *cur++; - *out++ = ';'; + buf[len++] = *cur++; + buf[len++] = ';'; } } else { - /* invalid for UTF-8 , use COPY(out); !!! 
*/ - if ((cur == 0x20) || (cur == 0xD) || (cur == 0xA) || (cur == 0x9)) { - *out++ = 0x20; - if (out - buffer > buffer_size - 10) { - int index = out - buffer; - - growBuffer(buffer); - out = &buffer[index]; + if ((c == 0x20) || (c == 0xD) || (c == 0xA) || (c == 0x9)) { + COPY_BUF(l,buf,len,0x20); + if (len > buf_size - 10) { + growBuffer(buf); } } else { - *out++ = cur; - if (out - buffer > buffer_size - 10) { - int index = out - buffer; - - growBuffer(buffer); - out = &buffer[index]; + COPY_BUF(l,buf,len,c); + if (len > buf_size - 10) { + growBuffer(buf); } } - NEXT; + NEXTL(l); } - cur = CUR; + GROW; + c = CUR_CHAR(l); } - *out++ = 0; + buf[len++] = 0; if (RAW == '<') { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, @@ -3882,7 +3922,7 @@ xmlParseAttValue(xmlParserCtxtPtr ctxt) { ctxt->disableSAX = 1; } else NEXT; - return(buffer); + return(buf); } /** @@ -6341,7 +6381,7 @@ xmlParseReference(xmlParserCtxtPtr ctxt) { int hex = NXT(2); int val = xmlParseCharRef(ctxt); - if (ctxt->encoding != NULL) { + if (ctxt->charset != XML_CHAR_ENCODING_UTF8) { /* * So we are using non-UTF-8 buffers * Check that the char fit on 8bits, if not @@ -6507,7 +6547,7 @@ xmlParseReference(xmlParserCtxtPtr ctxt) { ctxt->instate = XML_PARSER_EOF; return; } - if (input->standalone) { + if (input->standalone == 1) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "external parsed entities cannot be standalone\n"); @@ -9557,15 +9597,6 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size, } else if (ctxt->instate != XML_PARSER_EOF) xmlParseTryOrFinish(ctxt, terminate); if (terminate) { - /* - * Grab the encoding if it was added on-the-fly - */ - if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) && - (ctxt->myDoc->encoding == NULL)) { - ctxt->myDoc->encoding = ctxt->encoding; - ctxt->encoding = NULL; - } - /* * Check for termination */ diff --git a/parser.h b/parser.h index aa7e2735..9cb73bd0 
100644 --- a/parser.h +++ b/parser.h @@ -120,8 +120,8 @@ struct _xmlParserCtxt { xmlDocPtr myDoc; /* the document being built */ int wellFormed; /* is the document well formed */ int replaceEntities; /* shall we replace entities ? */ - const xmlChar *version; /* the XML version string */ - const xmlChar *encoding; /* encoding, if any */ + const xmlChar *version; /* the XML version string */ + const xmlChar *encoding; /* the declared encoding, if any */ int standalone; /* standalone document */ int html; /* are we parsing an HTML document */ @@ -177,7 +177,11 @@ struct _xmlParserCtxt { int * spaceTab; /* array of space infos */ int depth; /* to prevent entity substitution loops */ - xmlParserInputPtr entity; /* used to check entities boundaries */ + xmlParserInputPtr entity; /* used to check entities boundaries */ + int charset; /* encoding of the in-memory content + actually an xmlCharEncoding */ + int nodelen; /* Those two fields are there to */ + int nodemem; /* Speed up large node parsing */ }; /** diff --git a/result/HTML/test3.html b/result/HTML/test3.html index ad979574..597a1a42 100644 --- a/result/HTML/test3.html +++ b/result/HTML/test3.html @@ -1,6 +1,7 @@ +

Component Package diagram ProblemDomain


@@ -53,4 +54,5 @@ DataManagement + diff --git a/result/HTML/wired.html b/result/HTML/wired.html index 509bc1b5..d4439bbc 100644 --- a/result/HTML/wired.html +++ b/result/HTML/wired.html @@ -9,7 +9,7 @@ - @@ -89,7 +89,7 @@