From a6955c13c73b0800a00d263b02ebdbb4df9c5479 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Sun, 8 Sep 2024 23:19:49 +0200 Subject: [PATCH] html: Parse numeric character references according to HTML5 --- HTMLparser.c | 121 +++++++++++++++++++++++++++------------------------ 1 file changed, 63 insertions(+), 58 deletions(-) diff --git a/HTMLparser.c b/HTMLparser.c index cdebaf67..be99b936 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -3443,6 +3443,13 @@ done: return; } +static const short htmlC1Remap[32] = { + 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, + 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 +}; + /** * htmlParseCharRef: * @ctxt: an HTML parser context @@ -3462,63 +3469,57 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) { if ((ctxt == NULL) || (ctxt->input == NULL)) return(0); + if ((CUR == '&') && (NXT(1) == '#') && ((NXT(2) == 'x') || NXT(2) == 'X')) { SKIP(3); - while (CUR != ';') { - if ((CUR >= '0') && (CUR <= '9')) { - if (val < 0x110000) - val = val * 16 + (CUR - '0'); - } else if ((CUR >= 'a') && (CUR <= 'f')) { - if (val < 0x110000) - val = val * 16 + (CUR - 'a') + 10; - } else if ((CUR >= 'A') && (CUR <= 'F')) { - if (val < 0x110000) - val = val * 16 + (CUR - 'A') + 10; + while (1) { + int c = CUR; + + if ((c >= '0') && (c <= '9')) { + c -= '0'; + } else if ((c >= 'a') && (c <= 'f')) { + c = (c - 'a') + 10; + } else if ((c >= 'A') && (c <= 'F')) { + c = (c - 'A') + 10; } else { - htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, - "htmlParseCharRef: missing semicolon\n", - NULL, NULL); break; } + val = val * 16 + c; + if (val >= 0x110000) + val = 0x110000; NEXT; } if (CUR == ';') SKIP(1); } else if ((CUR == '&') && (NXT(1) == '#')) { SKIP(2); - while (CUR != ';') { - if ((CUR >= '0') && (CUR <= '9')) { - if (val < 0x110000) - val = val * 10 + (CUR - '0'); - } else { - htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, - "htmlParseCharRef: missing semicolon\n", - NULL, NULL); - break; - } + while (1) { + int c = CUR; + + if ((c < '0') || (c > '9')) + break; + val = val * 10 + (c - '0'); + if (val >= 0x110000) + val = 0x110000; NEXT; } if (CUR == ';') SKIP(1); - } else { - htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF, - "htmlParseCharRef: invalid value\n", NULL, NULL); } + /* - * Check the value IS_CHAR ... + * Remap C1 control characters */ - if (IS_CHAR(val)) { - return(val); - } else if (val >= 0x110000) { - htmlParseErr(ctxt, XML_ERR_INVALID_CHAR, - "htmlParseCharRef: value too large\n", NULL, NULL); - } else { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "htmlParseCharRef: invalid xmlChar value %d\n", - val); + if ((val >= 0x80) && (val < 0xA0)) { + val = htmlC1Remap[val - 0x80]; + } else if ((val <= 0) || + ((val >= 0xD800) && (val < 0xE000)) || + (val > 0x10FFFF)) { + val = 0xFFFD; } - return(0); + + return(val); } @@ -4070,10 +4071,15 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) */ static void htmlParseReference(htmlParserCtxtPtr ctxt) { + const xmlChar *repl = NULL; + int replLen = 0; xmlChar out[6]; - if (CUR != '&') return; - if (NXT(1) == '#') { + if ((NXT(1) == '#') && + ((IS_ASCII_DIGIT(NXT(2))) || + ((UPP(2) == 'X') && + ((IS_ASCII_DIGIT(NXT(3))) || + ((UPP(3) >= 'A') && (UPP(3) <= 'F')))))) { unsigned int c; int bits, i = 0; @@ -4091,30 +4097,29 @@ htmlParseReference(htmlParserCtxtPtr ctxt) { } out[i] = 0; - htmlCheckParagraph(ctxt); - if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) - ctxt->sax->characters(ctxt->userData, out, i); - } else { - const xmlChar *repl; - int nameLen, replLen; + repl = out; + replLen = i; + } else if (IS_ASCII_LETTER(NXT(1))) { + int nameLen; - htmlCheckParagraph(ctxt); - - SKIP(1); - repl = htmlFindEntityPrefix(CUR_PTR, - ctxt->input->end - CUR_PTR, + repl = htmlFindEntityPrefix(CUR_PTR + 1, + ctxt->input->end - CUR_PTR - 1, /* isAttr */ 0, &nameLen, &replLen); - if (repl == NULL) { - if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) - ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); - } else { - if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) - ctxt->sax->characters(ctxt->userData, repl, replLen); - SKIP(nameLen); - } + if (repl != NULL) + SKIP(nameLen + 1); } + + if (repl == NULL) { + repl = BAD_CAST "&"; + replLen = 1; + SKIP(1); + } + + htmlCheckParagraph(ctxt); + if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) + ctxt->sax->characters(ctxt->userData, repl, replLen); } /**