From 365e13be6c7faa99d2797b973e39b3491ab7c18a Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Sun, 2 Jul 2000 07:56:37 +0000 Subject: [PATCH] Work on character encoding support for the HTML parser Fixed some autoopen/autoclose probs for the HTML parser Fixed a potential memleak in the encoding stuff Daniel. --- ChangeLog | 6 + HTMLparser.c | 339 +++++++++++++++++++++++++++------------------------ doc/xml.html | 14 +-- encoding.c | 2 +- 4 files changed, 193 insertions(+), 168 deletions(-) diff --git a/ChangeLog b/ChangeLog index 72686396..31d42a6b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +Sun Jul 2 09:52:45 MEST 2000 Daniel Veillard + + * HTMLparser.c: Work on character encoding support for the HTML parser + * HTMLparser.c: Fixed some autoopen/autoclose probs for the HTML parser + * encoding.c: Fixed a potential memleak in the encoding stuff + Sat Jul 1 13:44:22 MEST 2000 Daniel Veillard * doc/FAQ.html doc/Makefile.am : added a FAQ diff --git a/HTMLparser.c b/HTMLparser.c index 375a038e..ff331488 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -479,17 +479,19 @@ htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) { /** * htmlAutoClose: * @ctxt: an HTML parser context - * @new: The new tag name + * @new: The new tag name or NULL * * The HTmL DtD allows a tag to implicitely close other tags. * The list is kept in htmlStartClose array. This function is * called when a new tag has been detected and generates the * appropriates closes if possible/needed. + * If new is NULL this mean we are at the end of the resource + * and we should check */ void htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) { xmlChar *oldname; - while ((ctxt->name != NULL) && + while ((new != NULL) && (ctxt->name != NULL) && (htmlCheckAutoClose(new, ctxt->name))) { #ifdef DEBUG fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name); @@ -504,6 +506,24 @@ htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) { xmlFree(oldname); } } + while ((new == NULL) && (ctxt->name != NULL) && + ((!xmlStrcmp(ctxt->name, BAD_CAST"head")) || + (!xmlStrcmp(ctxt->name, BAD_CAST"body")) || + (!xmlStrcmp(ctxt->name, BAD_CAST"html")))) { +#ifdef DEBUG + fprintf(stderr,"htmlAutoClose: EOF closes %s\n", ctxt->name); +#endif + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, ctxt->name); + oldname = htmlnamePop(ctxt); + if (oldname != NULL) { +#ifdef DEBUG + fprintf(stderr,"htmlAutoClose: popped %s\n", oldname); +#endif + xmlFree(oldname); + } + } + } /** @@ -1086,140 +1106,6 @@ htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, return(buffer); } - -/************************************************************************ - * * - * Commodity functions to handle encodings * - * * - ************************************************************************/ - -/** - * htmlSwitchEncoding: - * @ctxt: the parser context - * @len: the len of @cur - * - * change the input functions when discovering the character encoding - * of a given entity. - * - */ -void -htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc) -{ - switch (enc) { - case XML_CHAR_ENCODING_ERROR: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, "encoding unknown\n"); - ctxt->wellFormed = 0; - break; - case XML_CHAR_ENCODING_NONE: - /* let's assume it's UTF-8 without the XML decl */ - return; - case XML_CHAR_ENCODING_UTF8: - /* default encoding, no conversion should be needed */ - return; - case XML_CHAR_ENCODING_UTF16LE: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding UTF16 little endian not supported\n"); - break; - case XML_CHAR_ENCODING_UTF16BE: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding UTF16 big endian not supported\n"); - break; - case XML_CHAR_ENCODING_UCS4LE: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding USC4 little endian not supported\n"); - break; - case XML_CHAR_ENCODING_UCS4BE: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding USC4 big endian not supported\n"); - break; - case XML_CHAR_ENCODING_EBCDIC: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding EBCDIC not supported\n"); - break; - case XML_CHAR_ENCODING_UCS4_2143: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding UCS4 2143 not supported\n"); - break; - case XML_CHAR_ENCODING_UCS4_3412: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding UCS4 3412 not supported\n"); - break; - case XML_CHAR_ENCODING_UCS2: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding UCS2 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_1: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_1 ISO Latin 1 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_2: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_2 ISO Latin 2 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_3: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_3 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_4: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_4 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_5: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_5 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_6: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_6 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_7: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_7 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_8: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_8 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_9: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_9 not supported\n"); - break; - case XML_CHAR_ENCODING_2022_JP: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO-2022-JPnot supported\n"); - break; - case XML_CHAR_ENCODING_SHIFT_JIS: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding Shift_JISnot supported\n"); - break; - case XML_CHAR_ENCODING_EUC_JP: - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding EUC-JPnot supported\n"); - break; - } -} - /************************************************************************ * * * Commodity functions to handle streams * @@ -2227,6 +2113,112 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { return(name); } +/** + * htmlCheckEncoding: + * @ctxt: an HTML parser context + * @attvalue: the attribute value + * + * Checks an http-equiv attribute from a Meta tag to detect + * the encoding + * If a new encoding is detected the parser is switched to decode + * it and pass UTF8 + */ +void +htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { + const xmlChar *encoding; + + if ((ctxt == NULL) || (attvalue == NULL)) + return; + +fprintf(stderr, "htmlCheckEncoding: \"%s\"\n", attvalue); + + encoding = xmlStrstr(attvalue, BAD_CAST"charset="); + if (encoding == NULL) + encoding = xmlStrstr(attvalue, BAD_CAST"Charset="); + if (encoding == NULL) + encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET="); + if (encoding != NULL) { + encoding += 8; + } else { + encoding = xmlStrstr(attvalue, BAD_CAST"charset ="); + if (encoding == NULL) + encoding = xmlStrstr(attvalue, BAD_CAST"Charset ="); + if (encoding == NULL) + encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET ="); + if (encoding != NULL) + encoding += 9; + } + if (encoding != NULL) { + xmlCharEncoding enc; + xmlCharEncodingHandlerPtr handler; + + while ((*encoding == ' ') || (*encoding == '\t')) encoding++; + + if (ctxt->input->encoding != NULL) + xmlFree((xmlChar *) ctxt->input->encoding); + ctxt->input->encoding = xmlStrdup(encoding); + + enc = xmlParseCharEncoding((const char *) encoding); + /* + * registered set of known encodings + */ + if (enc != XML_CHAR_ENCODING_ERROR) { + xmlSwitchEncoding(ctxt, enc); + } else { + /* + * fallback for unknown encodings + */ + handler = xmlFindCharEncodingHandler((const char *) encoding); + if (handler != NULL) { + xmlSwitchToEncoding(ctxt, handler); + } else { + ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + } + } + } +} + +/** + * htmlCheckMeta: + * @ctxt: an HTML parser context + * @atts: the attributes values + * + * Checks an attributes from a Meta tag + */ +void +htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { + int i; + const xmlChar *att, *value; + int http = 0; + const xmlChar *content = NULL; + + if ((ctxt == NULL) || (atts == NULL)) + return; + + i = 0; + att = atts[i++]; + while (att != NULL) { + value = atts[i++]; + if ((value != NULL) && + ((!xmlStrcmp(att, BAD_CAST"http-equiv")) || + (!xmlStrcmp(att, BAD_CAST"Http-Equiv")) || + (!xmlStrcmp(att, BAD_CAST"HTTP-EQUIV"))) && + ((!xmlStrcmp(value, BAD_CAST"Content-Type")) || + (!xmlStrcmp(value, BAD_CAST"content-type")) || + (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE")))) + http = 1; + else if ((value != NULL) && + ((!xmlStrcmp(att, BAD_CAST"content")) || + (!xmlStrcmp(att, BAD_CAST"Content")) || + (!xmlStrcmp(att, BAD_CAST"CONTENT")))) + content = value; + att = atts[i++]; + } + if ((http) && (content != NULL)) + htmlCheckEncoding(ctxt, content); + +} + /** * htmlParseStartTag: * @ctxt: an HTML parser context @@ -2254,6 +2246,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { const xmlChar **atts = NULL; int nbatts = 0; int maxatts = 0; + int meta = 0; int i; if (CUR != '<') return; @@ -2268,6 +2261,8 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { ctxt->wellFormed = 0; return; } + if (!xmlStrcmp(name, BAD_CAST"meta")) + meta = 1; /* * Check for auto-closure of HTML elements. @@ -2293,6 +2288,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { GROW; attname = htmlParseAttribute(ctxt, &attvalue); if (attname != NULL) { + /* * Well formedness requires at most one declaration of an attribute */ @@ -2349,6 +2345,12 @@ failed: } } + /* + * Handle specific association to the META tag + */ + if (meta) + htmlCheckMeta(ctxt, atts); + /* * SAX: Start of Element ! */ @@ -2587,6 +2589,13 @@ htmlParseContent(htmlParserCtxtPtr ctxt) { htmlParseReference(ctxt); } + /* + * Fourth : end of the resource + */ + else if (CUR == 0) { + htmlAutoClose(ctxt, NULL); + } + /* * Last case, text. Note that References are handled directly. */ @@ -2851,6 +2860,13 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { */ htmlParseContent(ctxt); + /* + * autoclose + */ + if (CUR == 0) + htmlAutoClose(ctxt, NULL); + + /* * SAX: end of the document processing. */ @@ -2942,26 +2958,7 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt) void htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) { - htmlParserInputPtr input; - xmlChar *oldname; - - if (ctxt == NULL) return; - - while ((input = inputPop(ctxt)) != NULL) { - xmlFreeInputStream(input); - } - - if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab); - while ((oldname = htmlnamePop(ctxt)) != NULL) { - xmlFree(oldname); - } - if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab); - if (ctxt->directory != NULL) xmlFree(ctxt->directory); - if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab); - if (ctxt->version != NULL) xmlFree((char *) ctxt->version); - if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler)) - xmlFree(ctxt->sax); - xmlFree(ctxt); + xmlFreeParserCtxt(ctxt); } /** @@ -3096,7 +3093,7 @@ int htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { int ret = 0; htmlParserInputPtr in; - int avail; + int avail = 0; xmlChar cur, next; #ifdef DEBUG_PUSH @@ -3142,6 +3139,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { avail = in->length - (in->cur - in->base); else avail = in->buf->buffer->use - (in->cur - in->base); + if ((avail == 0) && (terminate)) { + htmlAutoClose(ctxt, NULL); + if (ctxt->nameNr == 0) + ctxt->instate = XML_PARSER_EOF; + } if (avail < 1) goto done; switch (ctxt->instate) { @@ -3445,14 +3447,26 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { * Handle preparsed entities and charRef */ if (ctxt->token != 0) { - xmlChar cur[2] = { 0 , 0 } ; + xmlChar chr[2] = { 0 , 0 } ; - cur[0] = (xmlChar) ctxt->token; + chr[0] = (xmlChar) ctxt->token; if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) - ctxt->sax->characters(ctxt->userData, cur, 1); + ctxt->sax->characters(ctxt->userData, chr, 1); ctxt->token = 0; ctxt->checkIndex = 0; } + if ((avail == 1) && (terminate)) { + cur = in->cur[0]; + if ((cur != '<') && (cur != '&')) { + if ((ctxt->sax != NULL) && + (ctxt->sax->characters != NULL)) + ctxt->sax->characters(ctxt->userData, &cur, 1); + ctxt->token = 0; + ctxt->checkIndex = 0; + NEXT; + } + break; + } if (avail < 2) goto done; cur = in->cur[0]; @@ -3588,6 +3602,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { } } done: + if ((avail == 0) && (terminate)) { + htmlAutoClose(ctxt, NULL); + if (ctxt->nameNr == 0) + ctxt->instate = XML_PARSER_EOF; + } #ifdef DEBUG_PUSH fprintf(stderr, "HPP: done %d\n", ret); #endif diff --git a/doc/xml.html b/doc/xml.html index 84a6aaf1..02d9c98a 100644 --- a/doc/xml.html +++ b/doc/xml.html @@ -3,7 +3,7 @@ The XML library for Gnome - + @@ -75,9 +75,9 @@ building tag-based structured documents/data.

There are some on-line resources about using libxml:

    -
  1. The code is commented in a way which allows extensive documentation to be - automatically extracted.
  2. +
  3. Check the FAQ
  4. +
  5. Check the extensive + documentation automatically extracted from code comments.
  6. This page provides a global overview and some examples on how to use libxml.
  7. James Henstridge wrote
    CVSROOT=:pserver:anonymous@dev.w3.org:/sources/public
    -    password: anonymous
    -    module: XML
    + password: anonymous + module: XML
  8. The Gnome @@ -1180,6 +1180,6 @@ base under gnome-xml/example

    Daniel Veillard

    -

    $Id: xml.html,v 1.34 2000/06/23 18:32:15 veillard Exp $

    +

    $Id: xml.html,v 1.35 2000/06/30 17:58:22 veillard Exp $

    diff --git a/encoding.c b/encoding.c index d1897991..dd367201 100644 --- a/encoding.c +++ b/encoding.c @@ -1167,7 +1167,7 @@ xmlFindCharEncodingHandler(const char *name) { iconv_close(icv_out); return(NULL); } - enc->name = NULL; + enc->name = xmlMemStrdup(name); enc->input = NULL; enc->output = NULL; enc->iconv_in = icv_in;