1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-24 13:33:01 +03:00

Document support for the non-standard escape sequences.

Support non-BMP code points in surrogate pairs of '\uXXXX\uXXXX'.
This commit is contained in:
Damjan Jovanovic
2021-05-31 07:45:18 +02:00
committed by Nick Wellnhofer
parent b66c19612c
commit 37ebf8a8b2

View File

@@ -4907,6 +4907,47 @@ xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
}
}
/**
 * parse_escaped_codeunit:
 * @ctxt: a regexp parser context
 *
 * Reads four hexadecimal digits (upper or lower case) and combines them,
 * most significant digit first, into a single 16-bit code unit value.
 *
 * NEXT is invoked before each digit is read through CUR, so the digits
 * consumed are the four characters following the position the parser is
 * at on entry (NEXT/CUR/ERROR are macros defined elsewhere in this file;
 * presumably they advance the cursor, read the current character and
 * record a compile error respectively -- confirm against their definitions).
 *
 * Returns the value in the range 0..0xFFFF, or -1 after raising
 * "Expecting hex digit" if any of the four characters is not a hex digit.
 */
static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)
{
    int unit = 0;
    int remaining;

    for (remaining = 4; remaining > 0; remaining--) {
        int ch;
        int digit;

        NEXT;
        ch = CUR;

        if (ch >= '0' && ch <= '9') {
            digit = ch - '0';
        } else if (ch >= 'A' && ch <= 'F') {
            digit = 10 + (ch - 'A');
        } else if (ch >= 'a' && ch <= 'f') {
            digit = 10 + (ch - 'a');
        } else {
            /* Keep ERROR inside a braced block: it is a macro whose
             * expansion is not visible here. */
            ERROR("Expecting hex digit");
            return -1;
        }

        unit = unit * 16 + digit;
    }

    return unit;
}
/**
 * parse_escaped_codepoint:
 * @ctxt: a regexp parser context
 *
 * Parses the hex digits of a '\uXXXX' escape via parse_escaped_codeunit().
 * If the resulting code unit is a UTF-16 high surrogate (0xD800..0xDBFF),
 * a second '\uXXXX' escape holding a low surrogate (0xDC00..0xDFFF) must
 * follow immediately; the two units are then combined into one code point
 * at or above 0x10000.
 *
 * Any other value from the first code unit -- including -1 on a hex digit
 * error -- is returned unchanged.
 *
 * Returns the code point, or -1 on error. When a high surrogate is not
 * followed by a valid low surrogate escape, "Invalid low surrogate pair
 * code unit" is raised (in addition to any error already raised while
 * parsing the second unit's digits).
 */
static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)
{
    int high = parse_escaped_codeunit(ctxt);
    int low = -1; /* stays outside the low surrogate range unless parsed */

    /* Not a high surrogate (or already an error): nothing more to read. */
    if (high < 0xD800 || high > 0xDBFF) {
        return high;
    }

    /* Expect a backslash followed by 'u' introducing the second unit. */
    NEXT;
    if (CUR == '\\') {
        NEXT;
        if (CUR == 'u') {
            low = parse_escaped_codeunit(ctxt);
        }
    }

    if (low < 0xDC00 || low > 0xDFFF) {
        ERROR("Invalid low surrogate pair code unit");
        return -1;
    }

    /* Standard UTF-16 surrogate pair decoding. */
    return (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
}
/**
* xmlFAParseCharClassEsc:
* @ctxt: a regexp parser context
@@ -4969,10 +5010,25 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
(cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
(cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
(cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
(cur == 0x5E) || (cur == '!') || (cur == '"') || (cur == '#') ||
(cur == '$') || (cur == '%') || (cur == ',') || (cur == '/') ||
(cur == ':') || (cur == ';') || (cur == '=') || (cur == '>') ||
(cur == '@') || (cur == '`') || (cur == '~') || (cur == 'u')) {
(cur == 0x5E) ||
/* Non-standard escape sequences:
* Java 1.8|.NET Core 3.1|MSXML 6 */
(cur == '!') || /* + | + | + */
(cur == '"') || /* + | + | + */
(cur == '#') || /* + | + | + */
(cur == '$') || /* + | + | + */
(cur == '%') || /* + | + | + */
(cur == ',') || /* + | + | + */
(cur == '/') || /* + | + | + */
(cur == ':') || /* + | + | + */
(cur == ';') || /* + | + | + */
(cur == '=') || /* + | + | + */
(cur == '>') || /* | + | + */
(cur == '@') || /* + | + | + */
(cur == '`') || /* + | + | + */
(cur == '~') || /* + | + | + */
(cur == 'u')) { /* | + | + */
if (ctxt->atom == NULL) {
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
if (ctxt->atom != NULL) {
@@ -4987,21 +5043,12 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
ctxt->atom->codepoint = '\t';
break;
case 'u':
{
char hex_buffer[5];
int loop;
for (loop = 0; loop < 4; loop++) {
NEXT;
if (!('0' <= CUR && CUR <= '9') && !('a' <= CUR && CUR <= 'f') && !('A' <= CUR && CUR <= 'F')) {
ERROR("Expecting hex digit");
cur = parse_escaped_codepoint(ctxt);
if (cur < 0) {
return;
}
hex_buffer[loop] = CUR;
}
hex_buffer[4] = 0;
ctxt->atom->codepoint = (int)strtoul(hex_buffer, NULL, 16);
ctxt->atom->codepoint = cur;
break;
}
default:
ctxt->atom->codepoint = cur;
}