1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-30 22:43:14 +03:00

html: Handle numeric character references directly

This commit is contained in:
Nick Wellnhofer
2024-09-26 17:09:40 +02:00
parent 0bc4608c50
commit 207999793f

View File

@ -2452,84 +2452,19 @@ static const short htmlC1Remap[32] = {
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
};
static int
htmlParseNCR(const xmlChar *string, size_t slen, int *dlen) {
const xmlChar *in = string;
const xmlChar *end = string + slen;
unsigned val = 0;
while (in < end) {
int c = *in;
if ((c < '0') || (c > '9'))
break;
val = val * 10 + (c - '0');
if (val >= 0x110000)
val = 0x110000;
in += 1;
}
if (*in == ';')
in += 1;
if ((val >= 0x80) && (val < 0xA0)) {
val = htmlC1Remap[val - 0x80];
} else if ((val <= 0) ||
((val >= 0xD800) && (val < 0xE000)) ||
(val > 0x10FFFF)) {
val = 0xFFFD;
}
*dlen = in - string;
return(val);
}
static int
htmlParseNCRHex(const xmlChar *string, size_t slen, int *dlen) {
const xmlChar *in = string;
const xmlChar *end = string + slen;
unsigned val = 0;
while (in < end) {
int c = *in | 0x20;
if ((c >= '0') && (c <= '9')) {
c -= '0';
} else if ((c >= 'a') && (c <= 'f')) {
c = (c - 'a') + 10;
} else {
break;
}
val = val * 16 + c;
if (val >= 0x110000)
val = 0x110000;
in += 1;
}
if (*in == ';')
in += 1;
if ((val >= 0x80) && (val < 0xA0)) {
val = htmlC1Remap[val - 0x80];
} else if ((val <= 0) ||
((val >= 0xD800) && (val < 0xE000)) ||
(val > 0x10FFFF)) {
val = 0xFFFD;
}
*dlen = in - string;
return(val);
}
static const xmlChar *
htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
int i = 0;
int bits, hi;
if ((c >= 0x80) && (c < 0xA0)) {
c = htmlC1Remap[c - 0x80];
} else if ((c <= 0) ||
((c >= 0xD800) && (c < 0xE000)) ||
(c > 0x10FFFF)) {
c = 0xFFFD;
}
if (c < 0x80) { bits = 0; hi = 0x00; }
else if (c < 0x800) { bits = 6; hi = 0xC0; }
else if (c < 0x10000) { bits = 12; hi = 0xE0; }
@ -2656,7 +2591,7 @@ htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr,
* @refs: true if references are allowed
* @maxLength: maximum output length
*
* Parse data until callback signals to stop.
* Parse data until terminator is reached.
*
* Returns the parsed string or NULL in case of errors.
*/
@ -2694,6 +2629,9 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
size_t avail, chunkSize, extraSize;
int replSize;
int skip = 0;
int ncr = 0;
int ncrSize = 0;
int cp = 0;
chunk = input->cur;
avail = input->end - chunk;
@ -2704,7 +2642,7 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
while (!PARSER_STOPPED(ctxt)) {
size_t j;
int cur, size, cp;
int cur, size;
if ((!eof) && (avail <= 64)) {
size_t oldAvail = avail;
@ -2757,12 +2695,37 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
}
}
if (ncr) {
int lc = cur | 0x20;
int digit;
if ((cur >= '0') && (cur <= '9')) {
digit = cur - '0';
} else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
digit = (lc - 'a') + 10;
} else {
if (cur == ';') {
in += 1;
size += 1;
ncrSize += 1;
}
goto next_chunk;
}
cp = cp * ncr + digit;
if (cp >= 0x110000)
cp = 0x110000;
ncrSize += 1;
goto next_char;
}
switch (cur) {
case '&':
if (!refs)
break;
cp = 0;
j = 1;
if ((j < avail) && (in[j] == '#')) {
@ -2771,21 +2734,18 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
if ((in[j] | 0x20) == 'x') {
j += 1;
if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
cp = htmlParseNCRHex(in + j, avail - j, &skip);
skip += 3;
ncr = 16;
size = 3;
ncrSize = 3;
cp = 0;
}
} else if (IS_ASCII_DIGIT(in[j])) {
cp = htmlParseNCR(in + j, avail - j, &skip);
skip += 2;
ncr = 10;
size = 2;
ncrSize = 2;
cp = 0;
}
}
if (cp > 0) {
repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
goto next_chunk;
}
skip = 0;
} else {
repl = htmlFindEntityPrefix(in + j,
avail - j,
@ -2851,11 +2811,19 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
break;
}
next_char:
in += size;
avail -= size;
}
next_chunk:
if (ncrSize > 0) {
skip = ncrSize;
in -= ncrSize;
repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
}
chunkSize = in - chunk;
extraSize = chunkSize + replSize;
@ -3028,6 +2996,9 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
size_t avail;
int replSize;
int skip = 0;
int ncr = 0;
int ncrSize = 0;
int cp = 0;
chunk = input->cur;
avail = input->end - chunk;
@ -3038,7 +3009,7 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
while (!PARSER_STOPPED(ctxt)) {
size_t j;
int cur, size, cp;
int cur, size;
if (avail <= 64) {
if (!eof) {
@ -3068,6 +3039,32 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
size = 1;
col += 1;
if (ncr) {
int lc = cur | 0x20;
int digit;
if ((cur >= '0') && (cur <= '9')) {
digit = cur - '0';
} else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
digit = (lc - 'a') + 10;
} else {
if (cur == ';') {
in += 1;
size += 1;
ncrSize += 1;
}
goto next_chunk;
}
cp = cp * ncr + digit;
if (cp >= 0x110000)
cp = 0x110000;
ncrSize += 1;
goto next_char;
}
switch (cur) {
case '<':
if (mode == 0) {
@ -3155,7 +3152,6 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
if ((mode != 0) && (mode != DATA_RCDATA))
break;
cp = 0;
j = 1;
if ((j < avail) && (in[j] == '#')) {
@ -3164,21 +3160,18 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
if ((in[j] | 0x20) == 'x') {
j += 1;
if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
cp = htmlParseNCRHex(in + j, avail - j, &skip);
skip += 3;
ncr = 16;
size = 3;
ncrSize = 3;
cp = 0;
}
} else if (IS_ASCII_DIGIT(in[j])) {
cp = htmlParseNCR(in + j, avail - j, &skip);
skip += 2;
ncr = 10;
size = 2;
ncrSize = 2;
cp = 0;
}
}
if (cp > 0) {
repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
goto next_chunk;
}
skip = 0;
} else {
repl = htmlFindEntityPrefix(in + j,
avail - j,
@ -3244,11 +3237,19 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
break;
}
next_char:
in += size;
avail -= size;
}
next_chunk:
if (ncrSize > 0) {
skip = ncrSize;
in -= ncrSize;
repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
}
if (in > chunk) {
input->cur += in - chunk;
htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode);