1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-29 11:41:22 +03:00

html: Support encoding auto-detection in push parser

This aligns the push parser's encoding handling with that of the pull parser.
This commit is contained in:
Nick Wellnhofer
2025-01-31 21:05:22 +01:00
parent 641fb1acf5
commit 227d8f739b
2 changed files with 16 additions and 2 deletions

View File

@ -4935,6 +4935,14 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
*/ */
goto done; goto done;
case XML_PARSER_START: case XML_PARSER_START:
/*
* Very first chars read from the document flow.
*/
if ((!terminate) && (avail < 4))
goto done;
xmlDetectEncoding(ctxt);
/* /*
* This is wrong but matches long-standing behavior. In most * This is wrong but matches long-standing behavior. In most
* cases, a document starting with an XML declaration will * cases, a document starting with an XML declaration will
@ -4945,6 +4953,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8); xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
} }
/* fall through */
case XML_PARSER_XML_DECL:
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) { if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
ctxt->sax->setDocumentLocator(ctxt->userData, ctxt->sax->setDocumentLocator(ctxt->userData,
(xmlSAXLocator *) &xmlDefaultSAXLocator); (xmlSAXLocator *) &xmlDefaultSAXLocator);
@ -4953,8 +4964,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(!ctxt->disableSAX)) (!ctxt->disableSAX))
ctxt->sax->startDocument(ctxt->userData); ctxt->sax->startDocument(ctxt->userData);
/* Allow callback to modify state */ /* Allow callback to modify state for tests */
if (ctxt->instate == XML_PARSER_START) if ((ctxt->instate == XML_PARSER_START) ||
(ctxt->instate == XML_PARSER_XML_DECL))
ctxt->instate = XML_PARSER_MISC; ctxt->instate = XML_PARSER_MISC;
break; break;
case XML_PARSER_START_TAG: { case XML_PARSER_START_TAG: {

View File

@ -1797,6 +1797,8 @@ htmlTokenizerTest(const char *filename, const char *result,
config.startTag = BAD_CAST startTag; config.startTag = BAD_CAST startTag;
config.inCharacters = 0; config.inCharacters = 0;
ctxt->_private = &config; ctxt->_private = &config;
/* Skip charset auto-detection */
ctxt->instate = XML_PARSER_XML_DECL;
htmlCtxtUseOptions(ctxt, options | HTML_PARSE_HTML5); htmlCtxtUseOptions(ctxt, options | HTML_PARSE_HTML5);
htmlParseChunk(ctxt, data, size, 1); htmlParseChunk(ctxt, data, size, 1);
htmlFreeParserCtxt(ctxt); htmlFreeParserCtxt(ctxt);