From a19fa11e1d6c2b824f873b5be3786fc92380dd8f Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer
Date: Thu, 13 Apr 2023 15:11:47 +0200
Subject: [PATCH] parser: Fix regression when switching input encodings

Revert some changes from commit 98840d40.

WebKit/Chromium can actually switch from ISO-8859-1 to UTF-16 in the
middle of parsing. This is a bad idea, but we have to keep supporting
this use case.
---
 parserInternals.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/parserInternals.c b/parserInternals.c
index 084de4a0..b8f6648b 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1177,12 +1177,20 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
     }
 
     if (in->encoder != NULL) {
+        if (in->encoder == handler)
+            return (0);
+
         /*
-         * TODO: Detect encoding mismatch. We should start by comparing
-         * in->encoder->name and handler->name, but there are a few
-         * compatible encodings like UTF-16 and UCS-2 or UTF-32 and UCS-4.
+         * Switching encodings during parsing is a really bad idea,
+         * but WebKit/Chromium switches from ISO-8859-1 to UTF-16 as soon as
+         * it finds Unicode characters with code points larger than 255.
+         *
+         * TODO: We should check whether the "raw" input buffer is empty and
+         * convert the old content using the old encoder.
          */
-        xmlCharEncCloseFunc(handler);
+
+        xmlCharEncCloseFunc(in->encoder);
+        in->encoder = handler;
         return (0);
     }
 