1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-29 11:41:22 +03:00

Work on character encoding support for the HTML parser

Fixed some autoopen/autoclose probs for the HTML parser
Fixed a potential memleak in the encoding stuff
Daniel.
This commit is contained in:
Daniel Veillard
2000-07-02 07:56:37 +00:00
parent af743793c8
commit 365e13be6c
4 changed files with 193 additions and 168 deletions

View File

@ -1,3 +1,9 @@
Sun Jul 2 09:52:45 MEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
* HTMLparser.c: Work on character encoding support for the HTML parser
* HTMLparser.c: Fixed some autoopen/autoclose probs for the HTML parser
* encoding.c: Fixed a potential memleak in the encoding stuff
Sat Jul 1 13:44:22 MEST 2000 Daniel Veillard <Daniel.Veillard@w3.org> Sat Jul 1 13:44:22 MEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
* doc/FAQ.html doc/Makefile.am : added a FAQ * doc/FAQ.html doc/Makefile.am : added a FAQ

View File

@ -479,17 +479,19 @@ htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) {
/** /**
* htmlAutoClose: * htmlAutoClose:
* @ctxt: an HTML parser context * @ctxt: an HTML parser context
* @new: The new tag name * @new: The new tag name or NULL
* *
* The HTmL DtD allows a tag to implicitely close other tags. * The HTmL DtD allows a tag to implicitely close other tags.
* The list is kept in htmlStartClose array. This function is * The list is kept in htmlStartClose array. This function is
* called when a new tag has been detected and generates the * called when a new tag has been detected and generates the
* appropriates closes if possible/needed. * appropriates closes if possible/needed.
* If new is NULL this means we are at the end of the resource
* and we should check whether a still-open head, body or html
* element has to be closed.
*/ */
void void
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) { htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
xmlChar *oldname; xmlChar *oldname;
while ((ctxt->name != NULL) && while ((new != NULL) && (ctxt->name != NULL) &&
(htmlCheckAutoClose(new, ctxt->name))) { (htmlCheckAutoClose(new, ctxt->name))) {
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name); fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name);
@ -504,6 +506,24 @@ htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
xmlFree(oldname); xmlFree(oldname);
} }
} }
while ((new == NULL) && (ctxt->name != NULL) &&
((!xmlStrcmp(ctxt->name, BAD_CAST"head")) ||
(!xmlStrcmp(ctxt->name, BAD_CAST"body")) ||
(!xmlStrcmp(ctxt->name, BAD_CAST"html")))) {
#ifdef DEBUG
fprintf(stderr,"htmlAutoClose: EOF closes %s\n", ctxt->name);
#endif
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, ctxt->name);
oldname = htmlnamePop(ctxt);
if (oldname != NULL) {
#ifdef DEBUG
fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
#endif
xmlFree(oldname);
}
}
} }
/** /**
@ -1086,140 +1106,6 @@ htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
return(buffer); return(buffer);
} }
/************************************************************************
* *
* Commodity functions to handle encodings *
* *
************************************************************************/
/**
 * htmlReportEncodingError:
 * @ctxt:  the parser context (must not be NULL)
 * @msg:  the message to report; it is handed to the SAX error callback
 *        as the format string, so it must not contain any '%' conversion
 *
 * Forwards an encoding related message to the SAX error callback when
 * both the SAX block and its error handler are set; otherwise the
 * message is dropped.
 */
static void
htmlReportEncodingError(htmlParserCtxtPtr ctxt, const char *msg)
{
    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
        ctxt->sax->error(ctxt->userData, msg);
}

/**
 * htmlSwitchEncoding:
 * @ctxt:  the parser context
 * @enc:  the character encoding which was detected
 *
 * change the input functions when discovering the character encoding
 * of a given entity.
 *
 * No conversion is actually installed here: XML_CHAR_ENCODING_NONE and
 * XML_CHAR_ENCODING_UTF8 are accepted silently, XML_CHAR_ENCODING_ERROR
 * is reported and flags the document as not well formed, and every other
 * known encoding is reported as not supported through the SAX error
 * callback.
 */
void
htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
{
    if (ctxt == NULL)
        return;

    switch (enc) {
        case XML_CHAR_ENCODING_ERROR:
            htmlReportEncodingError(ctxt, "encoding unknown\n");
            /* only the unknown encoding case clears the wellFormed flag */
            ctxt->wellFormed = 0;
            break;
        case XML_CHAR_ENCODING_NONE:
            /* let's assume it's UTF-8 without the XML decl */
            break;
        case XML_CHAR_ENCODING_UTF8:
            /* default encoding, no conversion should be needed */
            break;
        case XML_CHAR_ENCODING_UTF16LE:
            htmlReportEncodingError(ctxt,
                "char encoding UTF16 little endian not supported\n");
            break;
        case XML_CHAR_ENCODING_UTF16BE:
            htmlReportEncodingError(ctxt,
                "char encoding UTF16 big endian not supported\n");
            break;
        case XML_CHAR_ENCODING_UCS4LE:
            htmlReportEncodingError(ctxt,
                "char encoding UCS4 little endian not supported\n");
            break;
        case XML_CHAR_ENCODING_UCS4BE:
            htmlReportEncodingError(ctxt,
                "char encoding UCS4 big endian not supported\n");
            break;
        case XML_CHAR_ENCODING_EBCDIC:
            htmlReportEncodingError(ctxt,
                "char encoding EBCDIC not supported\n");
            break;
        case XML_CHAR_ENCODING_UCS4_2143:
            htmlReportEncodingError(ctxt,
                "char encoding UCS4 2143 not supported\n");
            break;
        case XML_CHAR_ENCODING_UCS4_3412:
            htmlReportEncodingError(ctxt,
                "char encoding UCS4 3412 not supported\n");
            break;
        case XML_CHAR_ENCODING_UCS2:
            htmlReportEncodingError(ctxt,
                "char encoding UCS2 not supported\n");
            break;
        case XML_CHAR_ENCODING_8859_1:
            htmlReportEncodingError(ctxt,
                "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
            break;
        case XML_CHAR_ENCODING_8859_2:
            htmlReportEncodingError(ctxt,
                "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
            break;
        case XML_CHAR_ENCODING_8859_3:
            htmlReportEncodingError(ctxt,
                "char encoding ISO_8859_3 not supported\n");
            break;
        case XML_CHAR_ENCODING_8859_4:
            htmlReportEncodingError(ctxt,
                "char encoding ISO_8859_4 not supported\n");
            break;
        case XML_CHAR_ENCODING_8859_5:
            htmlReportEncodingError(ctxt,
                "char encoding ISO_8859_5 not supported\n");
            break;
        case XML_CHAR_ENCODING_8859_6:
            htmlReportEncodingError(ctxt,
                "char encoding ISO_8859_6 not supported\n");
            break;
        case XML_CHAR_ENCODING_8859_7:
            htmlReportEncodingError(ctxt,
                "char encoding ISO_8859_7 not supported\n");
            break;
        case XML_CHAR_ENCODING_8859_8:
            htmlReportEncodingError(ctxt,
                "char encoding ISO_8859_8 not supported\n");
            break;
        case XML_CHAR_ENCODING_8859_9:
            htmlReportEncodingError(ctxt,
                "char encoding ISO_8859_9 not supported\n");
            break;
        case XML_CHAR_ENCODING_2022_JP:
            htmlReportEncodingError(ctxt,
                "char encoding ISO-2022-JP not supported\n");
            break;
        case XML_CHAR_ENCODING_SHIFT_JIS:
            htmlReportEncodingError(ctxt,
                "char encoding Shift_JIS not supported\n");
            break;
        case XML_CHAR_ENCODING_EUC_JP:
            htmlReportEncodingError(ctxt,
                "char encoding EUC-JP not supported\n");
            break;
        default:
            /* values outside the handled set are ignored, as before */
            break;
    }
}
/************************************************************************ /************************************************************************
* * * *
* Commodity functions to handle streams * * Commodity functions to handle streams *
@ -2227,6 +2113,112 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
return(name); return(name);
} }
/**
 * htmlCheckEncoding:
 * @ctxt:  an HTML parser context
 * @attvalue:  the attribute value
 *
 * Checks an http-equiv attribute from a Meta tag to detect
 * the encoding
 * If a new encoding is detected the parser is switched to decode
 * it and pass UTF8
 *
 * The value is searched for a "charset=" or "charset =" marker (in
 * lower case, capitalized or upper case spelling). The text following
 * the marker, after skipping blanks and tabs, replaces
 * ctxt->input->encoding (the previous string is freed). If the name is
 * neither a registered encoding nor known to
 * xmlFindCharEncodingHandler(), ctxt->errNo is set to
 * XML_ERR_UNSUPPORTED_ENCODING.
 * Nothing is done when @ctxt, its current input or @attvalue is NULL.
 */
void
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
    /*
     * Markers are tried in this exact order: the forms without a space
     * first, then the forms with one space before the '='.
     */
    static const struct {
        const char *text;
        int len;
    } markers[] = {
        { "charset=", 8 },
        { "Charset=", 8 },
        { "CHARSET=", 8 },
        { "charset =", 9 },
        { "Charset =", 9 },
        { "CHARSET =", 9 }
    };
    const xmlChar *encoding = NULL;
    xmlCharEncoding enc;
    xmlCharEncodingHandlerPtr handler;
    unsigned int i;

    if ((ctxt == NULL) || (ctxt->input == NULL) || (attvalue == NULL))
        return;

#ifdef DEBUG
    fprintf(stderr, "htmlCheckEncoding: \"%s\"\n", attvalue);
#endif

    for (i = 0; i < sizeof(markers) / sizeof(markers[0]); i++) {
        encoding = xmlStrstr(attvalue, BAD_CAST markers[i].text);
        if (encoding != NULL) {
            /* step over the marker to reach the encoding name */
            encoding += markers[i].len;
            break;
        }
    }
    if (encoding == NULL)
        return;

    while ((*encoding == ' ') || (*encoding == '\t'))
        encoding++;

    /* the input keeps its own copy of the declared encoding name */
    if (ctxt->input->encoding != NULL)
        xmlFree((xmlChar *) ctxt->input->encoding);
    ctxt->input->encoding = xmlStrdup(encoding);

    enc = xmlParseCharEncoding((const char *) encoding);
    if (enc != XML_CHAR_ENCODING_ERROR) {
        /*
         * registered set of known encodings
         */
        xmlSwitchEncoding(ctxt, enc);
        return;
    }

    /*
     * fallback for unknown encodings
     */
    handler = xmlFindCharEncodingHandler((const char *) encoding);
    if (handler != NULL)
        xmlSwitchToEncoding(ctxt, handler);
    else
        ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
}
/**
 * htmlCheckMeta:
 * @ctxt:  an HTML parser context
 * @atts:  the attributes values
 *
 * Inspects the attributes of a Meta tag. @atts is walked as a NULL
 * terminated sequence of name / value pairs. When an http-equiv
 * attribute whose value is Content-Type is present together with a
 * content attribute, the content value is handed to
 * htmlCheckEncoding(). Attributes with a NULL value are skipped.
 */
void
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
    const xmlChar **pair;
    const xmlChar *attname, *attval;
    const xmlChar *content = NULL;
    int isContentType = 0;

    if ((ctxt == NULL) || (atts == NULL))
        return;

    /* pair[0] is the attribute name, pair[1] its value */
    for (pair = atts; pair[0] != NULL; pair += 2) {
        attname = pair[0];
        attval = pair[1];

        if (attval == NULL)
            continue;

        if ((!xmlStrcmp(attname, BAD_CAST"http-equiv")) ||
            (!xmlStrcmp(attname, BAD_CAST"Http-Equiv")) ||
            (!xmlStrcmp(attname, BAD_CAST"HTTP-EQUIV"))) {
            if ((!xmlStrcmp(attval, BAD_CAST"Content-Type")) ||
                (!xmlStrcmp(attval, BAD_CAST"content-type")) ||
                (!xmlStrcmp(attval, BAD_CAST"CONTENT-TYPE")))
                isContentType = 1;
            continue;
        }

        if ((!xmlStrcmp(attname, BAD_CAST"content")) ||
            (!xmlStrcmp(attname, BAD_CAST"Content")) ||
            (!xmlStrcmp(attname, BAD_CAST"CONTENT")))
            content = attval;
    }

    if (isContentType && (content != NULL))
        htmlCheckEncoding(ctxt, content);
}
/** /**
* htmlParseStartTag: * htmlParseStartTag:
* @ctxt: an HTML parser context * @ctxt: an HTML parser context
@ -2254,6 +2246,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
const xmlChar **atts = NULL; const xmlChar **atts = NULL;
int nbatts = 0; int nbatts = 0;
int maxatts = 0; int maxatts = 0;
int meta = 0;
int i; int i;
if (CUR != '<') return; if (CUR != '<') return;
@ -2268,6 +2261,8 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
ctxt->wellFormed = 0; ctxt->wellFormed = 0;
return; return;
} }
if (!xmlStrcmp(name, BAD_CAST"meta"))
meta = 1;
/* /*
* Check for auto-closure of HTML elements. * Check for auto-closure of HTML elements.
@ -2293,6 +2288,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
GROW; GROW;
attname = htmlParseAttribute(ctxt, &attvalue); attname = htmlParseAttribute(ctxt, &attvalue);
if (attname != NULL) { if (attname != NULL) {
/* /*
* Well formedness requires at most one declaration of an attribute * Well formedness requires at most one declaration of an attribute
*/ */
@ -2349,6 +2345,12 @@ failed:
} }
} }
/*
* Handle specific association to the META tag
*/
if (meta)
htmlCheckMeta(ctxt, atts);
/* /*
* SAX: Start of Element ! * SAX: Start of Element !
*/ */
@ -2587,6 +2589,13 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
htmlParseReference(ctxt); htmlParseReference(ctxt);
} }
/*
* Fourth : end of the resource
*/
else if (CUR == 0) {
htmlAutoClose(ctxt, NULL);
}
/* /*
* Last case, text. Note that References are handled directly. * Last case, text. Note that References are handled directly.
*/ */
@ -2851,6 +2860,13 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
*/ */
htmlParseContent(ctxt); htmlParseContent(ctxt);
/*
* autoclose
*/
if (CUR == 0)
htmlAutoClose(ctxt, NULL);
/* /*
* SAX: end of the document processing. * SAX: end of the document processing.
*/ */
@ -2942,26 +2958,7 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
void void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{ {
htmlParserInputPtr input; xmlFreeParserCtxt(ctxt);
xmlChar *oldname;
if (ctxt == NULL) return;
while ((input = inputPop(ctxt)) != NULL) {
xmlFreeInputStream(input);
}
if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
while ((oldname = htmlnamePop(ctxt)) != NULL) {
xmlFree(oldname);
}
if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
if (ctxt->directory != NULL) xmlFree(ctxt->directory);
if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
xmlFree(ctxt->sax);
xmlFree(ctxt);
} }
/** /**
@ -3096,7 +3093,7 @@ int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
int ret = 0; int ret = 0;
htmlParserInputPtr in; htmlParserInputPtr in;
int avail; int avail = 0;
xmlChar cur, next; xmlChar cur, next;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
@ -3142,6 +3139,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
avail = in->length - (in->cur - in->base); avail = in->length - (in->cur - in->base);
else else
avail = in->buf->buffer->use - (in->cur - in->base); avail = in->buf->buffer->use - (in->cur - in->base);
if ((avail == 0) && (terminate)) {
htmlAutoClose(ctxt, NULL);
if (ctxt->nameNr == 0)
ctxt->instate = XML_PARSER_EOF;
}
if (avail < 1) if (avail < 1)
goto done; goto done;
switch (ctxt->instate) { switch (ctxt->instate) {
@ -3445,14 +3447,26 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
* Handle preparsed entities and charRef * Handle preparsed entities and charRef
*/ */
if (ctxt->token != 0) { if (ctxt->token != 0) {
xmlChar cur[2] = { 0 , 0 } ; xmlChar chr[2] = { 0 , 0 } ;
cur[0] = (xmlChar) ctxt->token; chr[0] = (xmlChar) ctxt->token;
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, cur, 1); ctxt->sax->characters(ctxt->userData, chr, 1);
ctxt->token = 0; ctxt->token = 0;
ctxt->checkIndex = 0; ctxt->checkIndex = 0;
} }
if ((avail == 1) && (terminate)) {
cur = in->cur[0];
if ((cur != '<') && (cur != '&')) {
if ((ctxt->sax != NULL) &&
(ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, &cur, 1);
ctxt->token = 0;
ctxt->checkIndex = 0;
NEXT;
}
break;
}
if (avail < 2) if (avail < 2)
goto done; goto done;
cur = in->cur[0]; cur = in->cur[0];
@ -3588,6 +3602,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
} }
} }
done: done:
if ((avail == 0) && (terminate)) {
htmlAutoClose(ctxt, NULL);
if (ctxt->nameNr == 0)
ctxt->instate = XML_PARSER_EOF;
}
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
fprintf(stderr, "HPP: done %d\n", ret); fprintf(stderr, "HPP: done %d\n", ret);
#endif #endif

View File

@ -3,7 +3,7 @@
<html> <html>
<head> <head>
<title>The XML library for Gnome</title> <title>The XML library for Gnome</title>
<meta name="GENERATOR" content="amaya V3.1"> <meta name="GENERATOR" content="amaya V2.1">
<meta http-equiv="Content-Type" content="text/html"> <meta http-equiv="Content-Type" content="text/html">
</head> </head>
@ -75,9 +75,9 @@ building tag-based structured documents/data.</p>
<p>There are some on-line resources about using libxml:</p> <p>There are some on-line resources about using libxml:</p>
<ol> <ol>
<li>The code is commented in a way which allows <a <li>Check the <a href="FAQ.html">FAQ</a></li>
href="http://xmlsoft.org/libxml.html">extensive documentation</a> to be <li>Check the <a href="http://xmlsoft.org/libxml.html">extensive
automatically extracted.</li> documentation</a> automatically extracted from code comments.</li>
<li>This page provides a global overview and <a href="#real">some <li>This page provides a global overview and <a href="#real">some
examples</a> on how to use libxml.</li> examples</a> on how to use libxml.</li>
<li><a href="mailto:james@daa.com.au">James Henstridge</a> wrote <a <li><a href="mailto:james@daa.com.au">James Henstridge</a> wrote <a
@ -166,8 +166,8 @@ platform, get in touch with me to upload the package. I will keep them in the
this base for my own development, so it's updated more regularly, but the this base for my own development, so it's updated more regularly, but the
content may not be as stable):</p> content may not be as stable):</p>
<pre>CVSROOT=:pserver:anonymous@dev.w3.org:/sources/public <pre>CVSROOT=:pserver:anonymous@dev.w3.org:/sources/public
password: anonymous password: anonymous
module: XML</pre> module: XML</pre>
</li> </li>
<li><p>The <a <li><p>The <a
href="http://cvs.gnome.org/bonsai/rview.cgi?cvsroot=/cvs/gnome&amp;dir=gnome-xml">Gnome href="http://cvs.gnome.org/bonsai/rview.cgi?cvsroot=/cvs/gnome&amp;dir=gnome-xml">Gnome
@ -1180,6 +1180,6 @@ base under gnome-xml/example</p>
<p><a href="mailto:Daniel.Veillard@w3.org">Daniel Veillard</a></p> <p><a href="mailto:Daniel.Veillard@w3.org">Daniel Veillard</a></p>
<p>$Id: xml.html,v 1.34 2000/06/23 18:32:15 veillard Exp $</p> <p>$Id: xml.html,v 1.35 2000/06/30 17:58:22 veillard Exp $</p>
</body> </body>
</html> </html>

View File

@ -1167,7 +1167,7 @@ xmlFindCharEncodingHandler(const char *name) {
iconv_close(icv_out); iconv_close(icv_out);
return(NULL); return(NULL);
} }
enc->name = NULL; enc->name = xmlMemStrdup(name);
enc->input = NULL; enc->input = NULL;
enc->output = NULL; enc->output = NULL;
enc->iconv_in = icv_in; enc->iconv_in = icv_in;