diff --git a/HTMLparser.c b/HTMLparser.c
index bc543303..cf7bab5f 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2452,84 +2452,19 @@ static const short htmlC1Remap[32] = {
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
};
-static int
-htmlParseNCR(const xmlChar *string, size_t slen, int *dlen) {
- const xmlChar *in = string;
- const xmlChar *end = string + slen;
- unsigned val = 0;
-
- while (in < end) {
- int c = *in;
-
- if ((c < '0') || (c > '9'))
- break;
- val = val * 10 + (c - '0');
- if (val >= 0x110000)
- val = 0x110000;
-
- in += 1;
- }
-
- if (*in == ';')
- in += 1;
-
- if ((val >= 0x80) && (val < 0xA0)) {
- val = htmlC1Remap[val - 0x80];
- } else if ((val <= 0) ||
- ((val >= 0xD800) && (val < 0xE000)) ||
- (val > 0x10FFFF)) {
- val = 0xFFFD;
- }
-
- *dlen = in - string;
-
- return(val);
-}
-
-static int
-htmlParseNCRHex(const xmlChar *string, size_t slen, int *dlen) {
- const xmlChar *in = string;
- const xmlChar *end = string + slen;
- unsigned val = 0;
-
- while (in < end) {
- int c = *in | 0x20;
-
- if ((c >= '0') && (c <= '9')) {
- c -= '0';
- } else if ((c >= 'a') && (c <= 'f')) {
- c = (c - 'a') + 10;
- } else {
- break;
- }
- val = val * 16 + c;
- if (val >= 0x110000)
- val = 0x110000;
-
- in += 1;
- }
-
- if (*in == ';')
- in += 1;
-
- if ((val >= 0x80) && (val < 0xA0)) {
- val = htmlC1Remap[val - 0x80];
- } else if ((val <= 0) ||
- ((val >= 0xD800) && (val < 0xE000)) ||
- (val > 0x10FFFF)) {
- val = 0xFFFD;
- }
-
- *dlen = in - string;
-
- return(val);
-}
-
static const xmlChar *
htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
int i = 0;
int bits, hi;
+ if ((c >= 0x80) && (c < 0xA0)) {
+ c = htmlC1Remap[c - 0x80];
+ } else if ((c <= 0) ||
+ ((c >= 0xD800) && (c < 0xE000)) ||
+ (c > 0x10FFFF)) {
+ c = 0xFFFD;
+ }
+
if (c < 0x80) { bits = 0; hi = 0x00; }
else if (c < 0x800) { bits = 6; hi = 0xC0; }
else if (c < 0x10000) { bits = 12; hi = 0xE0; }
@@ -2656,7 +2591,7 @@ htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr,
* @refs: true if references are allowed
* @maxLength: maximum output length
*
- * Parse data until callback signals to stop.
+ * Parse data until terminator is reached.
*
* Returns the parsed string or NULL in case of errors.
*/
@@ -2694,6 +2629,9 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
size_t avail, chunkSize, extraSize;
int replSize;
int skip = 0;
+ int ncr = 0;
+ int ncrSize = 0;
+ int cp = 0;
chunk = input->cur;
avail = input->end - chunk;
@@ -2704,7 +2642,7 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
while (!PARSER_STOPPED(ctxt)) {
size_t j;
- int cur, size, cp;
+ int cur, size;
if ((!eof) && (avail <= 64)) {
size_t oldAvail = avail;
@@ -2757,12 +2695,37 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
}
}
+ if (ncr) {
+ int lc = cur | 0x20;
+ int digit;
+
+ if ((cur >= '0') && (cur <= '9')) {
+ digit = cur - '0';
+ } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
+ digit = (lc - 'a') + 10;
+ } else {
+ if (cur == ';') {
+ in += 1;
+ size += 1;
+ ncrSize += 1;
+ }
+ goto next_chunk;
+ }
+
+ cp = cp * ncr + digit;
+ if (cp >= 0x110000)
+ cp = 0x110000;
+
+ ncrSize += 1;
+
+ goto next_char;
+ }
+
switch (cur) {
case '&':
if (!refs)
break;
- cp = 0;
j = 1;
if ((j < avail) && (in[j] == '#')) {
@@ -2771,21 +2734,18 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
if ((in[j] | 0x20) == 'x') {
j += 1;
if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
- cp = htmlParseNCRHex(in + j, avail - j, &skip);
- skip += 3;
+ ncr = 16;
+ size = 3;
+ ncrSize = 3;
+ cp = 0;
}
} else if (IS_ASCII_DIGIT(in[j])) {
- cp = htmlParseNCR(in + j, avail - j, &skip);
- skip += 2;
+ ncr = 10;
+ size = 2;
+ ncrSize = 2;
+ cp = 0;
}
}
-
- if (cp > 0) {
- repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
- goto next_chunk;
- }
-
- skip = 0;
} else {
repl = htmlFindEntityPrefix(in + j,
avail - j,
@@ -2851,11 +2811,19 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
break;
}
+next_char:
in += size;
avail -= size;
}
next_chunk:
+ if (ncrSize > 0) {
+ skip = ncrSize;
+ in -= ncrSize;
+
+ repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
+ }
+
chunkSize = in - chunk;
extraSize = chunkSize + replSize;
@@ -3028,6 +2996,9 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
size_t avail;
int replSize;
int skip = 0;
+ int ncr = 0;
+ int ncrSize = 0;
+ int cp = 0;
chunk = input->cur;
avail = input->end - chunk;
@@ -3038,7 +3009,7 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
while (!PARSER_STOPPED(ctxt)) {
size_t j;
- int cur, size, cp;
+ int cur, size;
if (avail <= 64) {
if (!eof) {
@@ -3068,6 +3039,32 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
size = 1;
col += 1;
+ if (ncr) {
+ int lc = cur | 0x20;
+ int digit;
+
+ if ((cur >= '0') && (cur <= '9')) {
+ digit = cur - '0';
+ } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
+ digit = (lc - 'a') + 10;
+ } else {
+ if (cur == ';') {
+ in += 1;
+ size += 1;
+ ncrSize += 1;
+ }
+ goto next_chunk;
+ }
+
+ cp = cp * ncr + digit;
+ if (cp >= 0x110000)
+ cp = 0x110000;
+
+ ncrSize += 1;
+
+ goto next_char;
+ }
+
switch (cur) {
case '<':
if (mode == 0) {
@@ -3155,7 +3152,6 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
if ((mode != 0) && (mode != DATA_RCDATA))
break;
- cp = 0;
j = 1;
if ((j < avail) && (in[j] == '#')) {
@@ -3164,21 +3160,18 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
if ((in[j] | 0x20) == 'x') {
j += 1;
if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
- cp = htmlParseNCRHex(in + j, avail - j, &skip);
- skip += 3;
+ ncr = 16;
+ size = 3;
+ ncrSize = 3;
+ cp = 0;
}
} else if (IS_ASCII_DIGIT(in[j])) {
- cp = htmlParseNCR(in + j, avail - j, &skip);
- skip += 2;
+ ncr = 10;
+ size = 2;
+ ncrSize = 2;
+ cp = 0;
}
}
-
- if (cp > 0) {
- repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
- goto next_chunk;
- }
-
- skip = 0;
} else {
repl = htmlFindEntityPrefix(in + j,
avail - j,
@@ -3244,11 +3237,19 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
break;
}
+next_char:
in += size;
avail -= size;
}
next_chunk:
+ if (ncrSize > 0) {
+ skip = ncrSize;
+ in -= ncrSize;
+
+ repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
+ }
+
if (in > chunk) {
input->cur += in - chunk;
htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode);