1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-29 11:41:22 +03:00

html: Adjust xmlDetectEncoding for HTML

Don't check for UTF-32 or EBCDIC.

We now perform BOM sniffing and the first step of the HTML5 prescan
algorithm (detect UTF-16 XML declarations). The rest of the algorithm
still has to be implemented.
This commit is contained in:
Nick Wellnhofer
2025-02-01 14:58:06 +01:00
parent 227d8f739b
commit 6bb2ea8e70
2 changed files with 34 additions and 6 deletions

View File

@@ -4346,8 +4346,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
 xmlDetectEncoding(ctxt);
 /*
-* This is wrong but matches long-standing behavior. In most cases,
-* a document starting with an XML declaration will specify UTF-8.
+* TODO: Implement HTML5 prescan algorithm
+*/
+/*
+* This is wrong but matches long-standing behavior. In most
+* cases, a document starting with an XML declaration will
+* specify UTF-8. The HTML5 prescan algorithm handles
+* XML declarations in a better way.
 */
 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
 (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
@@ -4943,10 +4949,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 xmlDetectEncoding(ctxt);
 /*
+* TODO: Implement HTML5 prescan algorithm
+*/
+/*
 * This is wrong but matches long-standing behavior. In most
 * cases, a document starting with an XML declaration will
-* specify UTF-8.
+* specify UTF-8. The HTML5 prescan algorithm handles
+* XML declarations in a better way.
 */
 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
 (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {