diff --git a/xmlregexp.c b/xmlregexp.c
index f1366fd4..f5afa45d 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -4907,6 +4907,79 @@ xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
     }
 }
 
+/**
+ * parse_escaped_codeunit:
+ * @ctxt:  a regexp parser context
+ *
+ * Reads four hexadecimal digits of a \uXXXX escape and folds them into a
+ * single value, most significant digit first. NEXT is invoked before each
+ * read of CUR, so the caller is expected to enter with the character that
+ * precedes the first digit still current.
+ *
+ * Returns the value in the range 0..0xFFFF, or -1 after raising
+ * "Expecting hex digit" at the first character that is not [0-9A-Fa-f].
+ */
+static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)
+{
+    int val = 0, i, cur;
+    for (i = 0; i < 4; i++) {
+        NEXT;
+        val *= 16;
+        cur = CUR;
+        if (cur >= '0' && cur <= '9') {
+            val += cur - '0';
+        } else if (cur >= 'A' && cur <= 'F') {
+            val += cur - 'A' + 10;
+        } else if (cur >= 'a' && cur <= 'f') {
+            val += cur - 'a' + 10;
+        } else {
+            ERROR("Expecting hex digit");
+            return -1;
+        }
+    }
+    return val;
+}
+
+/**
+ * parse_escaped_codepoint:
+ * @ctxt:  a regexp parser context
+ *
+ * Parses the digits of a \uXXXX escape into a code point. A first value in
+ * the range 0xD800..0xDBFF must be followed directly by a second \uXXXX
+ * escape whose value lies in 0xDC00..0xDFFF; the pair is then combined into
+ * a single code point of 0x10000 or above. Any other first value, including
+ * a lone 0xDC00..0xDFFF, is returned unchanged.
+ *
+ * Returns the code point, or -1 once an error has been raised.
+ */
+static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)
+{
+    int val = parse_escaped_codeunit(ctxt);
+    int low;
+
+    /* Covers -1 (already reported) and every value outside 0xD800..0xDBFF. */
+    if (val < 0xD800 || val > 0xDBFF) {
+        return val;
+    }
+
+    NEXT;
+    if (CUR == '\\') {
+        NEXT;
+        if (CUR == 'u') {
+            low = parse_escaped_codeunit(ctxt);
+            if (low < 0) {
+                /* "Expecting hex digit" was already raised; report once. */
+                return -1;
+            }
+            if (0xDC00 <= low && low <= 0xDFFF) {
+                return (val - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
+            }
+        }
+    }
+    ERROR("Invalid low surrogate pair code unit");
+    return -1;
+}
+
 /**
  * xmlFAParseCharClassEsc:
  * @ctxt: a regexp parser context
@@ -4969,10 +5042,25 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
         (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
         (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
         (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
-        (cur == 0x5E) || (cur == '!') || (cur == '"') || (cur == '#') ||
-        (cur == '$') || (cur == '%') || (cur == ',') || (cur == '/') ||
-        (cur == ':') || (cur == ';') || (cur == '=') || (cur == '>') ||
-        (cur == '@') || (cur == '`') || (cur == '~') || (cur == 'u')) {
+        (cur == 0x5E) ||
+
+        /* Non-standard escape sequences:
+         *                  Java 1.8|.NET Core 3.1|MSXML 6 */
+        (cur == '!') ||     /*   +  |     +       |    +   */
+        (cur == '"') ||     /*   +  |     +       |    +   */
+        (cur == '#') ||     /*   +  |     +       |    +   */
+        (cur == '$') ||     /*   +  |     +       |    +   */
+        (cur == '%') ||     /*   +  |     +       |    +   */
+        (cur == ',') ||     /*   +  |     +       |    +   */
+        (cur == '/') ||     /*   +  |     +       |    +   */
+        (cur == ':') ||     /*   +  |     +       |    +   */
+        (cur == ';') ||     /*   +  |     +       |    +   */
+        (cur == '=') ||     /*   +  |     +       |    +   */
+        (cur == '>') ||     /*      |     +       |    +   */
+        (cur == '@') ||     /*   +  |     +       |    +   */
+        (cur == '`') ||     /*   +  |     +       |    +   */
+        (cur == '~') ||     /*   +  |     +       |    +   */
+        (cur == 'u')) {     /*      |     +       |    +   */
         if (ctxt->atom == NULL) {
             ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
             if (ctxt->atom != NULL) {
@@ -4987,21 +5075,12 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
                         ctxt->atom->codepoint = '\t';
                         break;
                     case 'u':
-                        {
-                            char hex_buffer[5];
-                            int loop;
-                            for (loop = 0; loop < 4; loop++) {
-                                NEXT;
-                                if (!('0' <= CUR && CUR <= '9') && !('a' <= CUR && CUR <= 'f') && !('A' <= CUR && CUR <= 'F')) {
-                                    ERROR("Expecting hex digit");
-                                    return;
-                                }
-                                hex_buffer[loop] = CUR;
+                        cur = parse_escaped_codepoint(ctxt);
+                        if (cur < 0) {
+                            return;
                         }
-                        hex_buffer[4] = 0;
-                        ctxt->atom->codepoint = (int)strtoul(hex_buffer, NULL, 16);
+                        ctxt->atom->codepoint = cur;
                         break;
-                        }
                     default:
                         ctxt->atom->codepoint = cur;
                 }