diff --git a/HTMLparser.c b/HTMLparser.c
index 1d286aef..325d29b7 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -56,6 +56,37 @@
(((c) >= 'A') && ((c) <= 'F')) || \
(((c) >= 'a') && ((c) <= 'f')))
+typedef const unsigned htmlAsciiMask[2]; /* one bit per character 0..63: word 0 holds 0..31, word 1 holds 32..63 */
+
+static htmlAsciiMask MASK_DQ = {
+ 0,
+ 1u << ('"' - 32),
+};
+static htmlAsciiMask MASK_SQ = {
+ 0,
+ 1u << ('\'' - 32),
+};
+static htmlAsciiMask MASK_GT = {
+ 0,
+ 1u << ('>' - 32),
+};
+static htmlAsciiMask MASK_DASH = {
+ 0,
+ 1u << ('-' - 32),
+};
+static htmlAsciiMask MASK_WS_GT = {
+ 1u << 0x09 | 1u << 0x0A | 1u << 0x0C | 1u << 0x0D, /* tab, line feed, form feed, carriage return */
+ 1u << (' ' - 32) | 1u << ('>' - 32), /* space and '>' */
+};
+static htmlAsciiMask MASK_DQ_GT = {
+ 0,
+ 1u << ('"' - 32) | 1u << ('>' - 32),
+};
+static htmlAsciiMask MASK_SQ_GT = {
+ 0,
+ 1u << ('\'' - 32) | 1u << ('>' - 32),
+};
+
static int htmlOmittedDefaultValue = 1;
static int
@@ -238,9 +269,6 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
*
* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
*
- * NEXT Skip to the next character, this does the proper decoding
- * in UTF-8 mode. It also pop-up unfinished entities on the fly.
- * NEXTL(l) Skip the current unicode character of l xmlChars long.
* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
*/
@@ -271,29 +299,6 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
/* Imported from XML */
#define CUR (*ctxt->input->cur)
-#define NEXT xmlNextChar(ctxt)
-
-#define RAW (*ctxt->input->cur)
-
-
-#define NEXTL(l) do { \
- if ((CUR == '\n') || ((CUR == '\r') && (NXT(1) == '\n'))) { \
- ctxt->input->line++; ctxt->input->col = 1; \
- } else ctxt->input->col++; \
- ctxt->input->cur += l; \
- } while (0)
-
-/************
- \
- if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
- if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
- ************/
-
-#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
-
-#define COPY_BUF(b, i, v) \
- if (v < 0x80) b[i++] = v; \
- else i += xmlCopyCharMultiByte(&b[i],v)
/**
* htmlFindEncoding:
@@ -350,129 +355,11 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) {
return(ret);
}
-/**
- * htmlCurrentChar:
- * @ctxt: the HTML parser context
- * @len: pointer to the length of the char read
- *
- * The current char value, if using UTF-8 this may actually span multiple
- * bytes in the input buffer. Implement the end of line normalization:
- * 2.11 End-of-Line Handling
- * If the encoding is unspecified, in the case we find an ISO-Latin-1
- * char, then the encoding converter is plugged in automatically.
- *
- * Returns the current char value and its length
- */
-
static int
-htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
- const unsigned char *cur;
- unsigned char c;
- unsigned int val;
-
- if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)
- xmlParserGrow(ctxt);
-
- /*
- * We are supposed to handle UTF8, check it's valid
- * From rfc2044: encoding of the Unicode values on UTF-8:
- *
- * UCS-4 range (hex.) UTF-8 octet sequence (binary)
- * 0000 0000-0000 007F 0xxxxxxx
- * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
- * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
- *
- * Check for the 0x110000 limit too
- */
- cur = ctxt->input->cur;
- c = *cur;
- if (c < 0x80) {
- if (c == 0) {
- if (ctxt->input->cur < ctxt->input->end) {
- *len = 1;
- return(0xFFFD);
- } else {
- *len = 0;
- return(0);
- }
- } else if (c == 0x0D) {
- if (cur[1] == 0x0A)
- *len = 2;
- else
- *len = 1;
- return(0x0A);
- }
-
- *len = 1;
- return(c);
- } else {
- size_t avail;
-
- if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
- xmlChar * guess;
-
- guess = htmlFindEncoding(ctxt);
- if (guess == NULL) {
- xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
- } else {
- xmlSwitchEncodingName(ctxt, (const char *) guess);
- xmlFree(guess);
- }
- ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
-
- cur = ctxt->input->cur;
- c = *cur;
- }
-
- if ((c & 0x40) == 0)
- goto encoding_error;
-
- avail = ctxt->input->end - ctxt->input->cur;
-
- if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
- goto encoding_error;
- if ((c & 0xe0) == 0xe0) {
- if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
- goto encoding_error;
- if ((c & 0xf0) == 0xf0) {
- if (((c & 0xf8) != 0xf0) ||
- (avail < 4) || ((cur[3] & 0xc0) != 0x80))
- goto encoding_error;
- /* 4-byte code */
- *len = 4;
- val = (cur[0] & 0x7) << 18;
- val |= (cur[1] & 0x3f) << 12;
- val |= (cur[2] & 0x3f) << 6;
- val |= cur[3] & 0x3f;
- if (val < 0x10000)
- goto encoding_error;
- } else {
- /* 3-byte code */
- *len = 3;
- val = (cur[0] & 0xf) << 12;
- val |= (cur[1] & 0x3f) << 6;
- val |= cur[2] & 0x3f;
- if (val < 0x800)
- goto encoding_error;
- }
- } else {
- /* 2-byte code */
- *len = 2;
- val = (cur[0] & 0x1f) << 6;
- val |= cur[1] & 0x3f;
- if (val < 0x80)
- goto encoding_error;
- }
- return(val);
- }
-
-encoding_error:
- xmlCtxtErrIO(ctxt, XML_ERR_INVALID_ENCODING, NULL);
-
- if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
- xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
- *len = 1;
- return(*ctxt->input->cur);
+htmlMaskMatch(htmlAsciiMask mask, unsigned c) { /* returns 1 if the bit for c is set in mask, 0 otherwise */
+ if (c >= 64) /* the two mask words only describe characters 0..63 */
+ return(0);
+ return((mask[c/32] >> (c & 31)) & 1); /* c/32 selects the word, c & 31 the bit inside it */
 }
static int
@@ -545,18 +432,46 @@ invalid:
static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
+ const xmlChar *cur = ctxt->input->cur; /* local cursor; written back to ctxt->input after the loop */
+ size_t avail = ctxt->input->end - cur; /* bytes left between cur and input->end */
int res = 0;
+ int line = ctxt->input->line;
+ int col = ctxt->input->col;
+
+ while (!PARSER_STOPPED(ctxt)) {
+ if (avail == 0) { /* local window used up: publish the position and ask for more input */
+ ctxt->input->cur = cur;
+ GROW;
+ cur = ctxt->input->cur; /* re-read the pointers after GROW */
+ avail = ctxt->input->end - cur;
+
+ if (avail == 0) /* still nothing available: stop */
+ break;
+ }
+
+ if (*cur == '\n') {
+ line++;
+ col = 1;
+ } else if (IS_WS_HTML(*cur)) {
+ col++;
+ } else {
+ break; /* first non-blank byte ends the run; it is not consumed */
+ }
+
+ cur += 1;
+ avail -= 1;
- while (IS_WS_HTML(*(ctxt->input->cur))) {
- if (*(ctxt->input->cur) == '\n') {
- ctxt->input->line++; ctxt->input->col = 1;
- } else ctxt->input->col++;
- ctxt->input->cur++;
- if (*ctxt->input->cur == 0)
- xmlParserGrow(ctxt);
if (res < INT_MAX)
res++;
}
+
+ ctxt->input->cur = cur;
+ ctxt->input->line = line;
+ ctxt->input->col = col;
+
+ if (res > 8) /* NOTE(review): the threshold 8 is not explained in this patch - confirm intent */
+ GROW;
+
return(res);
}
@@ -2003,21 +1918,6 @@ static const htmlEntityDesc html40EntitiesTable[] = {
* *
************************************************************************/
-/*
- * Macro used to grow the current buffer.
- */
-#define growBuffer(buffer) { \
- xmlChar *tmp; \
- buffer##_size *= 2; \
- tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size); \
- if (tmp == NULL) { \
- htmlErrMemory(ctxt); \
- xmlFree(buffer); \
- return(NULL); \
- } \
- buffer = tmp; \
-}
-
/**
* htmlEntityLookup:
* @name: the entity name
@@ -2748,120 +2648,277 @@ htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr,
return(match + 1);
}
-
/**
- * htmlParseHTMLAttribute:
+ * htmlParseData:
* @ctxt: an HTML parser context
- * @stop: a char stop value
+ * @mask: mask of terminating characters
+ * @comment: true if parsing a comment
+ * @refs: true if references are allowed
+ * @maxLength: maximum output length
*
- * parse an HTML attribute value till the stop (quote), if
- * stop is 0 then it stops at the first space
+ * Parse data until a terminator selected by @mask, or the end of input, is reached.
*
- * Returns the attribute parsed or NULL
+ * Returns the parsed string or NULL in case of errors.
*/
static xmlChar *
-htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, int stop) {
- xmlChar *buffer = NULL;
- int buffer_size = 0;
- int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
- XML_MAX_HUGE_LENGTH :
- XML_MAX_TEXT_LENGTH;
- xmlChar *out = NULL;
+htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
+ int comment, int refs, int maxLength) {
+ xmlParserInputPtr input = ctxt->input;
+ xmlChar *ret = NULL;
+ xmlChar *buffer;
+ xmlChar utf8Char[4];
+ size_t buffer_size;
+ size_t used;
+ int eof = PARSER_PROGRESSIVE(ctxt);
+ int line, col;
+ int termSkip = -1;
- /*
- * allocate a translation buffer.
- */
- buffer_size = HTML_PARSER_BUFFER_SIZE;
- buffer = xmlMalloc(buffer_size);
+ used = 0;
+ buffer_size = ctxt->spaceMax;
+ buffer = (xmlChar *) ctxt->spaceTab;
if (buffer == NULL) {
- htmlErrMemory(ctxt);
- return(NULL);
- }
- out = buffer;
-
- /*
- * Ok loop until we reach one of the ending chars
- */
- while ((PARSER_STOPPED(ctxt) == 0) &&
- (ctxt->input->cur < ctxt->input->end) &&
- ((stop == 0) || (CUR != stop))) {
- if ((stop == 0) && (CUR == '>')) break;
- if ((stop == 0) && (IS_WS_HTML(CUR))) break;
-
- if (out - buffer > buffer_size - 100) {
- int indx = out - buffer;
-
- growBuffer(buffer);
- out = &buffer[indx];
- }
-
- GROW;
-
- if (CUR == '&') {
- if (NXT(1) == '#') {
- unsigned int c;
- int bits;
-
- c = htmlParseCharRef(ctxt);
- if (c < 0x80)
- { *out++ = c; bits= -6; }
- else if (c < 0x800)
- { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
- else if (c < 0x10000)
- { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
- else
- { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
-
- for ( ; bits >= 0; bits-= 6) {
- *out++ = ((c >> bits) & 0x3F) | 0x80;
- }
- } else {
- const xmlChar *repl;
- int nameLen, replLen;
-
- SKIP(1);
- repl = htmlFindEntityPrefix(CUR_PTR,
- ctxt->input->end - CUR_PTR,
- /* isAttr */ 1,
- &nameLen, &replLen);
-
- if (repl == NULL) {
- *out++ = '&';
- } else {
- memcpy(out, repl, replLen);
- out += replLen;
- SKIP(nameLen);
- }
- }
- } else {
- unsigned int c;
- int bits, l;
-
- c = CUR_CHAR(l);
- if (c < 0x80)
- { *out++ = c; bits= -6; }
- else if (c < 0x800)
- { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
- else if (c < 0x10000)
- { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
- else
- { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
-
- for ( ; bits >= 0; bits-= 6) {
- *out++ = ((c >> bits) & 0x3F) | 0x80;
- }
- NEXTL(l);
- }
- if (out - buffer > maxLength) {
- htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
- "attribute value too long\n", NULL, NULL);
- xmlFree(buffer);
+ buffer_size = 500;
+ buffer = xmlMalloc(buffer_size + 1);
+ if (buffer == NULL) {
+ htmlErrMemory(ctxt);
return(NULL);
}
}
- *out = 0;
- return(buffer);
+
+ line = input->line;
+ col = input->col;
+
+ while (!PARSER_STOPPED(ctxt)) {
+ const xmlChar *chunk, *in, *repl;
+ size_t avail, chunkSize, extraSize;
+ int replSize;
+ int skip = 0;
+
+ chunk = input->cur;
+ avail = input->end - chunk;
+ in = chunk;
+
+ repl = BAD_CAST "";
+ replSize = 0;
+
+ while (!PARSER_STOPPED(ctxt)) {
+ size_t j;
+ int cur, size, cp;
+
+ if ((!eof) && (avail <= 64)) {
+ size_t oldAvail = avail;
+ size_t off = in - chunk;
+
+ input->cur = in;
+
+ xmlParserGrow(ctxt);
+
+ in = input->cur;
+ chunk = in - off;
+ input->cur = chunk;
+ avail = input->end - in;
+
+ if (oldAvail == avail)
+ eof = 1;
+ }
+
+ if (avail == 0) {
+ termSkip = 0;
+ break;
+ }
+
+ cur = *in;
+ size = 1;
+ col += 1;
+
+ if (htmlMaskMatch(mask, cur)) {
+ if (comment) {
+ if (avail < 2) {
+ termSkip = 1;
+ } else if (in[1] == '-') {
+ if (avail < 3) {
+ termSkip = 2;
+ } else if (in[2] == '>') {
+ termSkip = 3;
+ } else if (in[2] == '!') {
+ if (avail < 4)
+ termSkip = 3;
+ else if (in[3] == '>')
+ termSkip = 4;
+ }
+ }
+
+ if (termSkip >= 0)
+ break;
+ } else {
+ termSkip = 0;
+ break;
+ }
+ }
+
+ switch (cur) {
+ case '&':
+ if (!refs)
+ break;
+
+ cp = 0;
+ j = 1;
+
+ if ((j < avail) && (in[j] == '#')) {
+ j += 1;
+ if (j < avail) {
+ if ((in[j] | 0x20) == 'x') {
+ j += 1;
+ if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
+ cp = htmlParseNCRHex(in + j, avail - j, &skip);
+ skip += 3;
+ }
+ } else if (IS_ASCII_DIGIT(in[j])) {
+ cp = htmlParseNCR(in + j, avail - j, &skip);
+ skip += 2;
+ }
+ }
+
+ if (cp > 0) {
+ repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
+ goto next_chunk;
+ }
+
+ skip = 0;
+ } else {
+ repl = htmlFindEntityPrefix(in + j,
+ avail - j,
+ /* isAttr */ 1,
+ &skip, &replSize);
+ if (repl != NULL) {
+ skip += 1;
+ goto next_chunk;
+ }
+
+ skip = 0;
+ }
+
+ break;
+
+ case '\0':
+ skip = 1;
+ repl = BAD_CAST "\xEF\xBF\xBD";
+ replSize = 3;
+ goto next_chunk;
+
+ case '\n':
+ line += 1;
+ col = 1;
+ break;
+
+ case '\r':
+ skip = 1;
+ if (in[1] != 0x0A) {
+ repl = BAD_CAST "\x0A";
+ replSize = 1;
+ }
+ goto next_chunk;
+
+ default:
+ if (cur < 0x80)
+ break;
+
+ if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
+ xmlChar * guess;
+
+ /*
+ * This byte is read again once the encoding is known,
+ * so take back the column that was counted for it.
+ */
+ col -= 1;
+
+ /*
+ * Flush the bytes already scanned in this chunk before
+ * switching. input->cur still points at the chunk start,
+ * so restarting with them pending would scan them a
+ * second time and advance line/col twice.
+ */
+ if (in > chunk)
+ goto next_chunk;
+
+ guess = htmlFindEncoding(ctxt);
+ if (guess == NULL) {
+ xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
+ } else {
+ xmlSwitchEncodingName(ctxt, (const char *) guess);
+ xmlFree(guess);
+ }
+ input->flags |= XML_INPUT_HAS_ENCODING;
+
+ goto restart;
+ }
+
+ size = htmlValidateUtf8(ctxt, in, avail);
+
+ if (size <= 0) {
+ skip = 1;
+ repl = BAD_CAST "\xEF\xBF\xBD";
+ replSize = 3;
+ goto next_chunk;
+ }
+
+ break;
+ }
+
+ in += size;
+ avail -= size;
+ }
+
+next_chunk:
+ chunkSize = in - chunk;
+ extraSize = chunkSize + replSize;
+
+ if (extraSize > maxLength - used) {
+ htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
+ "value too long\n", NULL, NULL);
+ goto error;
+ }
+
+ if (extraSize > buffer_size - used) {
+ size_t newSize = (used + extraSize) * 2;
+ xmlChar *tmp = (xmlChar *) xmlRealloc(buffer, newSize + 1);
+
+ if (tmp == NULL) {
+ htmlErrMemory(ctxt);
+ goto error;
+ }
+ buffer = tmp;
+ buffer_size = newSize;
+ }
+
+ if (chunkSize > 0) {
+ input->cur += chunkSize;
+ memcpy(buffer + used, chunk, chunkSize);
+ used += chunkSize;
+ }
+
+ input->cur += skip;
+ if (replSize > 0) {
+ memcpy(buffer + used, repl, replSize);
+ used += replSize;
+ }
+
+ SHRINK;
+
+ if (termSkip >= 0)
+ break;
+
+restart:
+ ;
+ }
+
+ if (termSkip > 0) {
+ input->cur += termSkip;
+ col += termSkip;
+ }
+
+ input->line = line;
+ input->col = col;
+
+ ret = xmlMalloc(used + 1);
+ if (ret == NULL) {
+ htmlErrMemory(ctxt);
+ } else {
+ memcpy(ret, buffer, used);
+ ret[used] = 0;
+ }
+
+error:
+ ctxt->spaceTab = (void *) buffer;
+ ctxt->spaceMax = buffer_size;
+
+ return(ret);
}
/**
@@ -2894,22 +2951,22 @@ htmlParseEntityRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED,
static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
xmlChar *ret = NULL;
+ int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
+ XML_MAX_HUGE_LENGTH :
+ XML_MAX_TEXT_LENGTH; /* length cap handed to htmlParseData; chosen by HTML_PARSE_HUGE */
if (CUR == '"') {
SKIP(1);
- ret = htmlParseHTMLAttribute(ctxt, '"');
+ ret = htmlParseData(ctxt, MASK_DQ, 0, 1, maxLength); /* double-quoted value: stop at '"', references enabled */
if (CUR == '"')
- SKIP(1);
+ SKIP(1); /* consume the closing quote when present */
} else if (CUR == '\'') {
SKIP(1);
- ret = htmlParseHTMLAttribute(ctxt, '\'');
+ ret = htmlParseData(ctxt, MASK_SQ, 0, 1, maxLength); /* single-quoted value: stop at '\'', references enabled */
if (CUR == '\'')
- SKIP(1);
+ SKIP(1); /* consume the closing quote when present */
} else {
- /*
- * That's an HTMLism, the attribute value may not be quoted
- */
- ret = htmlParseHTMLAttribute(ctxt, 0);
+ ret = htmlParseData(ctxt, MASK_WS_GT, 0, 1, maxLength); /* unquoted value: stop at HTML whitespace or '>' */
}
return(ret);
}
@@ -3230,107 +3287,36 @@ restart:
*/
static void
htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) {
+ const xmlChar *comment = BAD_CAST "";
xmlChar *buf = NULL;
- int len;
- int size = HTML_PARSER_BUFFER_SIZE;
- int cur, l;
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
XML_MAX_HUGE_LENGTH :
XML_MAX_TEXT_LENGTH;
- xmlParserInputState state;
- state = ctxt->instate;
- ctxt->instate = XML_PARSER_COMMENT;
-
- buf = xmlMalloc(size);
- if (buf == NULL) {
- htmlErrMemory(ctxt);
- return;
- }
- len = 0;
- buf[len] = 0;
-
- cur = CUR_CHAR(l);
- if (!bogus) {
- if (cur == '>') {
+ if (bogus) {
+ buf = htmlParseData(ctxt, MASK_GT, 0, 0, maxLength);
+ if (CUR == '>')
SKIP(1);
- goto done;
- } else if ((cur == '-') && (NXT(1) == '>')) {
+ comment = buf;
+ } else {
+ if (CUR == '>') {
+ SKIP(1);
+ } else if ((CUR == '-') && (NXT(1) == '>')) {
SKIP(2);
- goto done;
- }
- }
-
- while (cur != 0) {
- if (bogus) {
- if (cur == '>') {
- SKIP(1);
- break;
- }
} else {
- if (cur == '-') {
- size_t avail = ctxt->input->end - ctxt->input->cur;
-
- if (avail < 2) {
- SKIP(1);
- break;
- } else if (NXT(1) == '-') {
- if (avail < 3) {
- SKIP(2);
- break;
- } else if (NXT(2) == '>') {
- SKIP(3);
- break;
- } else if (NXT(2) == '!') {
- if (avail < 4) {
- SKIP(3);
- break;
-
- } else if (NXT(3) == '>') {
- SKIP(4);
- break;
- }
- }
- }
- }
- }
-
- if (len + 5 >= size) {
- xmlChar *tmp;
-
- size *= 2;
- tmp = (xmlChar *) xmlRealloc(buf, size);
- if (tmp == NULL) {
- xmlFree(buf);
- htmlErrMemory(ctxt);
- ctxt->instate = state;
- return;
- }
- buf = tmp;
- }
-
- COPY_BUF(buf,len,cur);
- if (len > maxLength) {
- htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
- "comment too long", NULL, NULL);
- xmlFree(buf);
- ctxt->instate = state;
- return;
+ buf = htmlParseData(ctxt, MASK_DASH, 1, 0, maxLength);
+ comment = buf;
}
-
- NEXTL(l);
- cur = CUR_CHAR(l);
}
-done:
- buf[len] = 0;
+ if (comment == NULL)
+ return;
+
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
(!ctxt->disableSAX))
- ctxt->sax->comment(ctxt->userData, buf);
- xmlFree(buf);
+ ctxt->sax->comment(ctxt->userData, comment);
- ctxt->instate = state;
- return;
+ xmlFree(buf);
}
/**
@@ -3339,70 +3325,11 @@ done:
*
* DEPRECATED: Internal function, don't use.
*
- * parse Reference declarations
- *
- * [66] CharRef ::= '' [0-9]+ ';' |
- * '' [0-9a-fA-F]+ ';'
- *
- * Returns the value parsed (as an int)
+ * Returns 0
*/
int
-htmlParseCharRef(htmlParserCtxtPtr ctxt) {
- int val = 0;
-
- if ((ctxt == NULL) || (ctxt->input == NULL))
- return(0);
-
- if ((CUR == '&') && (NXT(1) == '#') &&
- ((NXT(2) == 'x') || NXT(2) == 'X')) {
- SKIP(3);
- while (1) {
- int c = CUR;
-
- if ((c >= '0') && (c <= '9')) {
- c -= '0';
- } else if ((c >= 'a') && (c <= 'f')) {
- c = (c - 'a') + 10;
- } else if ((c >= 'A') && (c <= 'F')) {
- c = (c - 'A') + 10;
- } else {
- break;
- }
- val = val * 16 + c;
- if (val >= 0x110000)
- val = 0x110000;
- NEXT;
- }
- if (CUR == ';')
- SKIP(1);
- } else if ((CUR == '&') && (NXT(1) == '#')) {
- SKIP(2);
- while (1) {
- int c = CUR;
-
- if ((c < '0') || (c > '9'))
- break;
- val = val * 10 + (c - '0');
- if (val >= 0x110000)
- val = 0x110000;
- NEXT;
- }
- if (CUR == ';')
- SKIP(1);
- }
-
- /*
- * Remap C1 control characters
- */
- if ((val >= 0x80) && (val < 0xA0)) {
- val = htmlC1Remap[val - 0x80];
- } else if ((val <= 0) ||
- ((val >= 0xD800) && (val < 0xE000)) ||
- (val > 0x10FFFF)) {
- val = 0xFFFD;
- }
-
- return(val);
+htmlParseCharRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED) {
+ return(0);
}
@@ -3417,63 +3344,81 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
static xmlChar *
htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) {
- xmlChar *buf = NULL;
- int len;
- int size = HTML_PARSER_BUFFER_SIZE;
- int quote, cur, l;
+ xmlChar *ret;
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
XML_MAX_TEXT_LENGTH :
XML_MAX_NAME_LENGTH;
- if ((CUR != '"') && (CUR != '\''))
- return(NULL);
- quote = CUR;
- NEXT;
-
- buf = xmlMalloc(size);
- if (buf == NULL) {
- htmlErrMemory(ctxt);
- return(NULL);
- }
- len = 0;
-
- while (ctxt->input->cur < ctxt->input->end) {
- cur = CUR_CHAR(l);
-
- if (cur == '>')
- break;
-
- if (cur == quote) {
+ if (CUR == '"') {
+ SKIP(1);
+ ret = htmlParseData(ctxt, MASK_DQ_GT, 0, 0, maxLength);
+ if (CUR == '"')
SKIP(1);
- break;
- }
-
- if (len + 5 >= size) {
- xmlChar *tmp;
-
- size *= 2;
- tmp = (xmlChar *) xmlRealloc(buf, size);
- if (tmp == NULL) {
- xmlFree(buf);
- htmlErrMemory(ctxt);
- return(NULL);
- }
- buf = tmp;
- }
-
- COPY_BUF(buf,len,cur);
- if (len > maxLength) {
- htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
- "identifier too long", NULL, NULL);
- xmlFree(buf);
- return(NULL);
- }
-
- NEXTL(l);
+ } else if (CUR == '\'') {
+ SKIP(1);
+ ret = htmlParseData(ctxt, MASK_SQ_GT, 0, 0, maxLength);
+ if (CUR == '\'')
+ SKIP(1);
+ } else {
+ return(NULL);
}
- buf[len] = 0;
- return(buf);
+ return(ret);
+}
+
+static void
+htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt) { /* skip input up to and including the next '>', or to the end of input, keeping line/col current */
+ const xmlChar *in;
+ size_t avail;
+ int eof = PARSER_PROGRESSIVE(ctxt); /* once non-zero, xmlParserGrow is no longer called below */
+ int line, col;
+
+ line = ctxt->input->line;
+ col = ctxt->input->col;
+
+ in = ctxt->input->cur;
+ avail = ctxt->input->end - in;
+
+ while (!PARSER_STOPPED(ctxt)) {
+ int cur;
+
+ if ((!eof) && (avail <= 64)) { /* little buffered input left: try to get more */
+ size_t oldAvail = avail;
+
+ ctxt->input->cur = in; /* publish the local position before growing */
+
+ xmlParserGrow(ctxt);
+
+ in = ctxt->input->cur; /* re-read the pointers after the grow call */
+ avail = ctxt->input->end - in;
+
+ if (oldAvail == avail) /* nothing new arrived: stop trying to grow */
+ eof = 1;
+ }
+
+ if (avail == 0)
+ break;
+
+ col += 1;
+
+ cur = *in;
+ if (cur == '>') {
+ in += 1; /* consume the '>' itself */
+ break;
+ } else if (cur == 0x0A) {
+ line += 1;
+ col = 1;
+ }
+
+ in += 1;
+ avail -= 1;
+
+ SHRINK; /* NOTE(review): ctxt->input->cur is not updated inside this loop, so it lags behind in here - confirm SHRINK tolerates that */
+ }
+
+ ctxt->input->cur = in;
+ ctxt->input->line = line;
+ ctxt->input->col = col;
+}
/**
@@ -3488,7 +3433,6 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
xmlChar *name = NULL;
xmlChar *publicId = NULL;
xmlChar *URI = NULL;
- int nameCap, nameSize;
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
XML_MAX_TEXT_LENGTH :
XML_MAX_NAME_LENGTH;
@@ -3500,60 +3444,21 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
SKIP_BLANKS;
- nameCap = 0;
- nameSize = 0;
- while (ctxt->input->cur < ctxt->input->end) {
- int l;
- int c = CUR_CHAR(l);
+ if ((ctxt->input->cur < ctxt->input->end) && (CUR != '>')) {
+ name = htmlParseData(ctxt, MASK_WS_GT, 0, 0, maxLength);
- if (c == '>')
- break;
+ if ((ctxt->options & HTML_PARSE_HTML5) && (name != NULL)) {
+ xmlChar *cur;
- if (nameSize + 5 > nameCap) {
- size_t newCap = nameCap ? nameCap * 2 : 32;
- xmlChar *tmp = xmlRealloc(name, newCap);
-
- if (tmp == NULL) {
- htmlErrMemory(ctxt);
- xmlFree(name);
- return;
+ for (cur = name; *cur; cur++) {
+ if ((*cur >= 'A') && (*cur <= 'Z'))
+ *cur += 0x20;
}
-
- name = tmp;
- nameCap = newCap;
}
- if (c < 0x80) {
- if (IS_WS_HTML(c))
- break;
-
- if ((ctxt->options & HTML_PARSE_HTML5) &&
- (c >= 'A') && (c <= 'Z'))
- c += 32;
-
- name[nameSize++] = c;
- } else {
- COPY_BUF(name, nameSize, c);
- }
-
- if (nameSize > maxLength) {
- htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
- "identifier too long", NULL, NULL);
- goto bogus;
- }
-
- NEXTL(l);
+ SKIP_BLANKS;
}
- if (name != NULL)
- name[nameSize] = 0;
-
- /*
- * Check that upper(name) == "HTML" !!!!!!!!!!!!!
- */
-
- SKIP_BLANKS;
-
/*
* Check for SystemID and publicId
*/
@@ -3576,14 +3481,7 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
}
bogus:
- /* Ignore bogus content */
- while (ctxt->input->cur < ctxt->input->end) {
- int c = CUR;
-
- NEXT;
- if (c == '>')
- break;
- }
+ htmlSkipBogusDoctype(ctxt);
/*
* Create or update the document accordingly to the DOCTYPE
@@ -3821,7 +3719,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
(PARSER_STOPPED(ctxt) == 0)) {
/* unexpected-solidus-in-tag */
if (CUR == '/') {
- NEXT;
+ SKIP(1);
SKIP_BLANKS;
continue;
}
@@ -3973,7 +3871,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
/* unexpected-solidus-in-tag */
if (CUR == '/') {
- NEXT;
+ SKIP(1);
SKIP_BLANKS;
continue;
}
@@ -3986,7 +3884,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
}
if (CUR == '>') {
- NEXT;
+ SKIP(1);
} else if ((CUR == '/') && (NXT(1) == '>')) {
SKIP(2);
} else {