diff --git a/HTMLparser.c b/HTMLparser.c index 1d286aef..325d29b7 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -56,6 +56,37 @@ (((c) >= 'A') && ((c) <= 'F')) || \ (((c) >= 'a') && ((c) <= 'f'))) +typedef const unsigned htmlAsciiMask[2]; + +static htmlAsciiMask MASK_DQ = { + 0, + 1u << ('"' - 32), +}; +static htmlAsciiMask MASK_SQ = { + 0, + 1u << ('\'' - 32), +}; +static htmlAsciiMask MASK_GT = { + 0, + 1u << ('>' - 32), +}; +static htmlAsciiMask MASK_DASH = { + 0, + 1u << ('-' - 32), +}; +static htmlAsciiMask MASK_WS_GT = { + 1u << 0x09 | 1u << 0x0A | 1u << 0x0C | 1u << 0x0D, + 1u << (' ' - 32) | 1u << ('>' - 32), +}; +static htmlAsciiMask MASK_DQ_GT = { + 0, + 1u << ('"' - 32) | 1u << ('>' - 32), +}; +static htmlAsciiMask MASK_SQ_GT = { + 0, + 1u << ('\'' - 32) | 1u << ('>' - 32), +}; + static int htmlOmittedDefaultValue = 1; static int @@ -238,9 +269,6 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt) * * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding * - * NEXT Skip to the next character, this does the proper decoding - * in UTF-8 mode. It also pop-up unfinished entities on the fly. - * NEXTL(l) Skip the current unicode character of l xmlChars long. 
* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly */ @@ -271,29 +299,6 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt) /* Imported from XML */ #define CUR (*ctxt->input->cur) -#define NEXT xmlNextChar(ctxt) - -#define RAW (*ctxt->input->cur) - - -#define NEXTL(l) do { \ - if ((CUR == '\n') || ((CUR == '\r') && (NXT(1) == '\n'))) { \ - ctxt->input->line++; ctxt->input->col = 1; \ - } else ctxt->input->col++; \ - ctxt->input->cur += l; \ - } while (0) - -/************ - \ - if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ - if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); - ************/ - -#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) - -#define COPY_BUF(b, i, v) \ - if (v < 0x80) b[i++] = v; \ - else i += xmlCopyCharMultiByte(&b[i],v) /** * htmlFindEncoding: @@ -350,129 +355,11 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) { return(ret); } -/** - * htmlCurrentChar: - * @ctxt: the HTML parser context - * @len: pointer to the length of the char read - * - * The current char value, if using UTF-8 this may actually span multiple - * bytes in the input buffer. Implement the end of line normalization: - * 2.11 End-of-Line Handling - * If the encoding is unspecified, in the case we find an ISO-Latin-1 - * char, then the encoding converter is plugged in automatically. - * - * Returns the current char value and its length - */ - static int -htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { - const unsigned char *cur; - unsigned char c; - unsigned int val; - - if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) - xmlParserGrow(ctxt); - - /* - * We are supposed to handle UTF8, check it's valid - * From rfc2044: encoding of the Unicode values on UTF-8: - * - * UCS-4 range (hex.) 
UTF-8 octet sequence (binary) - * 0000 0000-0000 007F 0xxxxxxx - * 0000 0080-0000 07FF 110xxxxx 10xxxxxx - * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx - * - * Check for the 0x110000 limit too - */ - cur = ctxt->input->cur; - c = *cur; - if (c < 0x80) { - if (c == 0) { - if (ctxt->input->cur < ctxt->input->end) { - *len = 1; - return(0xFFFD); - } else { - *len = 0; - return(0); - } - } else if (c == 0x0D) { - if (cur[1] == 0x0A) - *len = 2; - else - *len = 1; - return(0x0A); - } - - *len = 1; - return(c); - } else { - size_t avail; - - if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) { - xmlChar * guess; - - guess = htmlFindEncoding(ctxt); - if (guess == NULL) { - xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); - } else { - xmlSwitchEncodingName(ctxt, (const char *) guess); - xmlFree(guess); - } - ctxt->input->flags |= XML_INPUT_HAS_ENCODING; - - cur = ctxt->input->cur; - c = *cur; - } - - if ((c & 0x40) == 0) - goto encoding_error; - - avail = ctxt->input->end - ctxt->input->cur; - - if ((avail < 2) || ((cur[1] & 0xc0) != 0x80)) - goto encoding_error; - if ((c & 0xe0) == 0xe0) { - if ((avail < 3) || ((cur[2] & 0xc0) != 0x80)) - goto encoding_error; - if ((c & 0xf0) == 0xf0) { - if (((c & 0xf8) != 0xf0) || - (avail < 4) || ((cur[3] & 0xc0) != 0x80)) - goto encoding_error; - /* 4-byte code */ - *len = 4; - val = (cur[0] & 0x7) << 18; - val |= (cur[1] & 0x3f) << 12; - val |= (cur[2] & 0x3f) << 6; - val |= cur[3] & 0x3f; - if (val < 0x10000) - goto encoding_error; - } else { - /* 3-byte code */ - *len = 3; - val = (cur[0] & 0xf) << 12; - val |= (cur[1] & 0x3f) << 6; - val |= cur[2] & 0x3f; - if (val < 0x800) - goto encoding_error; - } - } else { - /* 2-byte code */ - *len = 2; - val = (cur[0] & 0x1f) << 6; - val |= cur[1] & 0x3f; - if (val < 0x80) - goto encoding_error; - } - return(val); - } - -encoding_error: - xmlCtxtErrIO(ctxt, XML_ERR_INVALID_ENCODING, NULL); - - if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) - xmlSwitchEncoding(ctxt, 
XML_CHAR_ENCODING_8859_1); - *len = 1; - return(*ctxt->input->cur); +htmlMaskMatch(htmlAsciiMask mask, unsigned c) { + if (c >= 64) + return(0); + return((mask[c/32] >> (c & 31)) & 1); } static int @@ -545,18 +432,46 @@ invalid: static int htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { + const xmlChar *cur = ctxt->input->cur; + size_t avail = ctxt->input->end - cur; int res = 0; + int line = ctxt->input->line; + int col = ctxt->input->col; + + while (!PARSER_STOPPED(ctxt)) { + if (avail == 0) { + ctxt->input->cur = cur; + GROW; + cur = ctxt->input->cur; + avail = ctxt->input->end - cur; + + if (avail == 0) + break; + } + + if (*cur == '\n') { + line++; + col = 1; + } else if (IS_WS_HTML(*cur)) { + col++; + } else { + break; + } + + cur += 1; + avail -= 1; - while (IS_WS_HTML(*(ctxt->input->cur))) { - if (*(ctxt->input->cur) == '\n') { - ctxt->input->line++; ctxt->input->col = 1; - } else ctxt->input->col++; - ctxt->input->cur++; - if (*ctxt->input->cur == 0) - xmlParserGrow(ctxt); if (res < INT_MAX) res++; } + + ctxt->input->cur = cur; + ctxt->input->line = line; + ctxt->input->col = col; + + if (res > 8) + GROW; + return(res); } @@ -2003,21 +1918,6 @@ static const htmlEntityDesc html40EntitiesTable[] = { * * ************************************************************************/ -/* - * Macro used to grow the current buffer. 
- */ -#define growBuffer(buffer) { \ - xmlChar *tmp; \ - buffer##_size *= 2; \ - tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size); \ - if (tmp == NULL) { \ - htmlErrMemory(ctxt); \ - xmlFree(buffer); \ - return(NULL); \ - } \ - buffer = tmp; \ -} - /** * htmlEntityLookup: * @name: the entity name @@ -2748,120 +2648,277 @@ htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr, return(match + 1); } - /** - * htmlParseHTMLAttribute: + * htmlParseData: * @ctxt: an HTML parser context - * @stop: a char stop value + * @mask: mask of terminating characters + * @comment: true if parsing a comment + * @refs: true if references are allowed + * @maxLength: maximum output length * - * parse an HTML attribute value till the stop (quote), if - * stop is 0 then it stops at the first space + * Parse data until a terminator selected by @mask is reached. * - * Returns the attribute parsed or NULL + * Returns the parsed string or NULL in case of errors. */ static xmlChar * -htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, int stop) { - xmlChar *buffer = NULL; - int buffer_size = 0; - int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? - XML_MAX_HUGE_LENGTH : - XML_MAX_TEXT_LENGTH; - xmlChar *out = NULL; +htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask, + int comment, int refs, int maxLength) { + xmlParserInputPtr input = ctxt->input; + xmlChar *ret = NULL; + xmlChar *buffer; + xmlChar utf8Char[4]; + size_t buffer_size; + size_t used; + int eof = PARSER_PROGRESSIVE(ctxt); + int line, col; + int termSkip = -1; - /* - * allocate a translation buffer. 
- */ - buffer_size = HTML_PARSER_BUFFER_SIZE; - buffer = xmlMalloc(buffer_size); + used = 0; + buffer_size = ctxt->spaceMax; + buffer = (xmlChar *) ctxt->spaceTab; if (buffer == NULL) { - htmlErrMemory(ctxt); - return(NULL); - } - out = buffer; - - /* - * Ok loop until we reach one of the ending chars - */ - while ((PARSER_STOPPED(ctxt) == 0) && - (ctxt->input->cur < ctxt->input->end) && - ((stop == 0) || (CUR != stop))) { - if ((stop == 0) && (CUR == '>')) break; - if ((stop == 0) && (IS_WS_HTML(CUR))) break; - - if (out - buffer > buffer_size - 100) { - int indx = out - buffer; - - growBuffer(buffer); - out = &buffer[indx]; - } - - GROW; - - if (CUR == '&') { - if (NXT(1) == '#') { - unsigned int c; - int bits; - - c = htmlParseCharRef(ctxt); - if (c < 0x80) - { *out++ = c; bits= -6; } - else if (c < 0x800) - { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } - else if (c < 0x10000) - { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } - else - { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } - - for ( ; bits >= 0; bits-= 6) { - *out++ = ((c >> bits) & 0x3F) | 0x80; - } - } else { - const xmlChar *repl; - int nameLen, replLen; - - SKIP(1); - repl = htmlFindEntityPrefix(CUR_PTR, - ctxt->input->end - CUR_PTR, - /* isAttr */ 1, - &nameLen, &replLen); - - if (repl == NULL) { - *out++ = '&'; - } else { - memcpy(out, repl, replLen); - out += replLen; - SKIP(nameLen); - } - } - } else { - unsigned int c; - int bits, l; - - c = CUR_CHAR(l); - if (c < 0x80) - { *out++ = c; bits= -6; } - else if (c < 0x800) - { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } - else if (c < 0x10000) - { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } - else - { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } - - for ( ; bits >= 0; bits-= 6) { - *out++ = ((c >> bits) & 0x3F) | 0x80; - } - NEXTL(l); - } - if (out - buffer > maxLength) { - htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, - "attribute value too long\n", NULL, NULL); - xmlFree(buffer); + buffer_size = 500; + buffer = xmlMalloc(buffer_size + 
1); + if (buffer == NULL) { + htmlErrMemory(ctxt); return(NULL); } } - *out = 0; - return(buffer); + + line = input->line; + col = input->col; + + while (!PARSER_STOPPED(ctxt)) { + const xmlChar *chunk, *in, *repl; + size_t avail, chunkSize, extraSize; + int replSize; + int skip = 0; + + chunk = input->cur; + avail = input->end - chunk; + in = chunk; + + repl = BAD_CAST ""; + replSize = 0; + + while (!PARSER_STOPPED(ctxt)) { + size_t j; + int cur, size, cp; + + if ((!eof) && (avail <= 64)) { + size_t oldAvail = avail; + size_t off = in - chunk; + + input->cur = in; + + xmlParserGrow(ctxt); + + in = input->cur; + chunk = in - off; + input->cur = chunk; + avail = input->end - in; + + if (oldAvail == avail) + eof = 1; + } + + if (avail == 0) { + termSkip = 0; + break; + } + + cur = *in; + size = 1; + col += 1; + + if (htmlMaskMatch(mask, cur)) { + if (comment) { + if (avail < 2) { + termSkip = 1; + } else if (in[1] == '-') { + if (avail < 3) { + termSkip = 2; + } else if (in[2] == '>') { + termSkip = 3; + } else if (in[2] == '!') { + if (avail < 4) + termSkip = 3; + else if (in[3] == '>') + termSkip = 4; + } + } + + if (termSkip >= 0) + break; + } else { + termSkip = 0; + break; + } + } + + switch (cur) { + case '&': + if (!refs) + break; + + cp = 0; + j = 1; + + if ((j < avail) && (in[j] == '#')) { + j += 1; + if (j < avail) { + if ((in[j] | 0x20) == 'x') { + j += 1; + if ((j < avail) && (IS_HEX_DIGIT(in[j]))) { + cp = htmlParseNCRHex(in + j, avail - j, &skip); + skip += 3; + } + } else if (IS_ASCII_DIGIT(in[j])) { + cp = htmlParseNCR(in + j, avail - j, &skip); + skip += 2; + } + } + + if (cp > 0) { + repl = htmlCodePointToUtf8(cp, utf8Char, &replSize); + goto next_chunk; + } + + skip = 0; + } else { + repl = htmlFindEntityPrefix(in + j, + avail - j, + /* isAttr */ 1, + &skip, &replSize); + if (repl != NULL) { + skip += 1; + goto next_chunk; + } + + skip = 0; + } + + break; + + case '\0': + skip = 1; + repl = BAD_CAST "\xEF\xBF\xBD"; + replSize = 3; + goto 
next_chunk; + + case '\n': + line += 1; + col = 1; + break; + + case '\r': + skip = 1; + if (in[1] != 0x0A) { + repl = BAD_CAST "\x0A"; + replSize = 1; + } + goto next_chunk; + + default: + if (cur < 0x80) + break; + + if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) { + xmlChar * guess; + + guess = htmlFindEncoding(ctxt); + if (guess == NULL) { + xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); + } else { + xmlSwitchEncodingName(ctxt, (const char *) guess); + xmlFree(guess); + } + input->flags |= XML_INPUT_HAS_ENCODING; + + goto restart; + } + + size = htmlValidateUtf8(ctxt, in, avail); + + if (size <= 0) { + skip = 1; + repl = BAD_CAST "\xEF\xBF\xBD"; + replSize = 3; + goto next_chunk; + } + + break; + } + + in += size; + avail -= size; + } + +next_chunk: + chunkSize = in - chunk; + extraSize = chunkSize + replSize; + + if (extraSize > maxLength - used) { + htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT, + "value too long\n", NULL, NULL); + goto error; + } + + if (extraSize > buffer_size - used) { + size_t newSize = (used + extraSize) * 2; + xmlChar *tmp = (xmlChar *) xmlRealloc(buffer, newSize + 1); + + if (tmp == NULL) { + htmlErrMemory(ctxt); + goto error; + } + buffer = tmp; + buffer_size = newSize; + } + + if (chunkSize > 0) { + input->cur += chunkSize; + memcpy(buffer + used, chunk, chunkSize); + used += chunkSize; + } + + input->cur += skip; + if (replSize > 0) { + memcpy(buffer + used, repl, replSize); + used += replSize; + } + + SHRINK; + + if (termSkip >= 0) + break; + +restart: + ; + } + + if (termSkip > 0) { + input->cur += termSkip; + col += termSkip; + } + + input->line = line; + input->col = col; + + ret = xmlMalloc(used + 1); + if (ret == NULL) { + htmlErrMemory(ctxt); + } else { + memcpy(ret, buffer, used); + ret[used] = 0; + } + +error: + ctxt->spaceTab = (void *) buffer; + ctxt->spaceMax = buffer_size; + + return(ret); } /** @@ -2894,22 +2951,22 @@ htmlParseEntityRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, static xmlChar * 
htmlParseAttValue(htmlParserCtxtPtr ctxt) { xmlChar *ret = NULL; + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? + XML_MAX_HUGE_LENGTH : + XML_MAX_TEXT_LENGTH; if (CUR == '"') { SKIP(1); - ret = htmlParseHTMLAttribute(ctxt, '"'); + ret = htmlParseData(ctxt, MASK_DQ, 0, 1, maxLength); if (CUR == '"') - SKIP(1); + SKIP(1); } else if (CUR == '\'') { SKIP(1); - ret = htmlParseHTMLAttribute(ctxt, '\''); + ret = htmlParseData(ctxt, MASK_SQ, 0, 1, maxLength); if (CUR == '\'') - SKIP(1); + SKIP(1); } else { - /* - * That's an HTMLism, the attribute value may not be quoted - */ - ret = htmlParseHTMLAttribute(ctxt, 0); + ret = htmlParseData(ctxt, MASK_WS_GT, 0, 1, maxLength); } return(ret); } @@ -3230,107 +3287,36 @@ restart: */ static void htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) { + const xmlChar *comment = BAD_CAST ""; xmlChar *buf = NULL; - int len; - int size = HTML_PARSER_BUFFER_SIZE; - int cur, l; int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? XML_MAX_HUGE_LENGTH : XML_MAX_TEXT_LENGTH; - xmlParserInputState state; - state = ctxt->instate; - ctxt->instate = XML_PARSER_COMMENT; - - buf = xmlMalloc(size); - if (buf == NULL) { - htmlErrMemory(ctxt); - return; - } - len = 0; - buf[len] = 0; - - cur = CUR_CHAR(l); - if (!bogus) { - if (cur == '>') { + if (bogus) { + buf = htmlParseData(ctxt, MASK_GT, 0, 0, maxLength); + if (CUR == '>') SKIP(1); - goto done; - } else if ((cur == '-') && (NXT(1) == '>')) { + comment = buf; + } else { + if (CUR == '>') { + SKIP(1); + } else if ((CUR == '-') && (NXT(1) == '>')) { SKIP(2); - goto done; - } - } - - while (cur != 0) { - if (bogus) { - if (cur == '>') { - SKIP(1); - break; - } } else { - if (cur == '-') { - size_t avail = ctxt->input->end - ctxt->input->cur; - - if (avail < 2) { - SKIP(1); - break; - } else if (NXT(1) == '-') { - if (avail < 3) { - SKIP(2); - break; - } else if (NXT(2) == '>') { - SKIP(3); - break; - } else if (NXT(2) == '!') { - if (avail < 4) { - SKIP(3); - break; - - } else if (NXT(3) == 
'>') { - SKIP(4); - break; - } - } - } - } - } - - if (len + 5 >= size) { - xmlChar *tmp; - - size *= 2; - tmp = (xmlChar *) xmlRealloc(buf, size); - if (tmp == NULL) { - xmlFree(buf); - htmlErrMemory(ctxt); - ctxt->instate = state; - return; - } - buf = tmp; - } - - COPY_BUF(buf,len,cur); - if (len > maxLength) { - htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, - "comment too long", NULL, NULL); - xmlFree(buf); - ctxt->instate = state; - return; + buf = htmlParseData(ctxt, MASK_DASH, 1, 0, maxLength); + comment = buf; } - - NEXTL(l); - cur = CUR_CHAR(l); } -done: - buf[len] = 0; + if (comment == NULL) + return; + if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && (!ctxt->disableSAX)) - ctxt->sax->comment(ctxt->userData, buf); - xmlFree(buf); + ctxt->sax->comment(ctxt->userData, comment); - ctxt->instate = state; - return; + xmlFree(buf); } /** @@ -3339,70 +3325,11 @@ done: * * DEPRECATED: Internal function, don't use. * - * parse Reference declarations - * - * [66] CharRef ::= '&#' [0-9]+ ';' | - * '&#x' [0-9a-fA-F]+ ';' - * - * Returns the value parsed (as an int) + * Returns 0 */ int -htmlParseCharRef(htmlParserCtxtPtr ctxt) { - int val = 0; - - if ((ctxt == NULL) || (ctxt->input == NULL)) - return(0); - - if ((CUR == '&') && (NXT(1) == '#') && - ((NXT(2) == 'x') || NXT(2) == 'X')) { - SKIP(3); - while (1) { - int c = CUR; - - if ((c >= '0') && (c <= '9')) { - c -= '0'; - } else if ((c >= 'a') && (c <= 'f')) { - c = (c - 'a') + 10; - } else if ((c >= 'A') && (c <= 'F')) { - c = (c - 'A') + 10; - } else { - break; - } - val = val * 16 + c; - if (val >= 0x110000) - val = 0x110000; - NEXT; - } - if (CUR == ';') - SKIP(1); - } else if ((CUR == '&') && (NXT(1) == '#')) { - SKIP(2); - while (1) { - int c = CUR; - - if ((c < '0') || (c > '9')) - break; - val = val * 10 + (c - '0'); - if (val >= 0x110000) - val = 0x110000; - NEXT; - } - if (CUR == ';') - SKIP(1); - } - - /* - * Remap C1 control characters - */ - if ((val >= 0x80) && (val < 0xA0)) { - val = 
htmlC1Remap[val - 0x80]; - } else if ((val <= 0) || - ((val >= 0xD800) && (val < 0xE000)) || - (val > 0x10FFFF)) { - val = 0xFFFD; - } - - return(val); +htmlParseCharRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED) { + return(0); } @@ -3417,63 +3344,81 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) { static xmlChar * htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) { - xmlChar *buf = NULL; - int len; - int size = HTML_PARSER_BUFFER_SIZE; - int quote, cur, l; + xmlChar *ret; int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? XML_MAX_TEXT_LENGTH : XML_MAX_NAME_LENGTH; - if ((CUR != '"') && (CUR != '\'')) - return(NULL); - quote = CUR; - NEXT; - - buf = xmlMalloc(size); - if (buf == NULL) { - htmlErrMemory(ctxt); - return(NULL); - } - len = 0; - - while (ctxt->input->cur < ctxt->input->end) { - cur = CUR_CHAR(l); - - if (cur == '>') - break; - - if (cur == quote) { + if (CUR == '"') { + SKIP(1); + ret = htmlParseData(ctxt, MASK_DQ_GT, 0, 0, maxLength); + if (CUR == '"') SKIP(1); - break; - } - - if (len + 5 >= size) { - xmlChar *tmp; - - size *= 2; - tmp = (xmlChar *) xmlRealloc(buf, size); - if (tmp == NULL) { - xmlFree(buf); - htmlErrMemory(ctxt); - return(NULL); - } - buf = tmp; - } - - COPY_BUF(buf,len,cur); - if (len > maxLength) { - htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT, - "identifier too long", NULL, NULL); - xmlFree(buf); - return(NULL); - } - - NEXTL(l); + } else if (CUR == '\'') { + SKIP(1); + ret = htmlParseData(ctxt, MASK_SQ_GT, 0, 0, maxLength); + if (CUR == '\'') + SKIP(1); + } else { + return(NULL); } - buf[len] = 0; - return(buf); + return(ret); +} + +static void +htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt) { + const xmlChar *in; + size_t avail; + int eof = PARSER_PROGRESSIVE(ctxt); + int line, col; + + line = ctxt->input->line; + col = ctxt->input->col; + + in = ctxt->input->cur; + avail = ctxt->input->end - in; + + while (!PARSER_STOPPED(ctxt)) { + int cur; + + if ((!eof) && (avail <= 64)) { + size_t oldAvail = avail; + + ctxt->input->cur = in; + + 
xmlParserGrow(ctxt); + + in = ctxt->input->cur; + avail = ctxt->input->end - in; + + if (oldAvail == avail) + eof = 1; + } + + if (avail == 0) + break; + + col += 1; + + cur = *in; + if (cur == '>') { + in += 1; + break; + } else if (cur == 0x0A) { + line += 1; + col = 1; + } + + in += 1; + avail -= 1; + + SHRINK; + } + + ctxt->input->cur = in; + ctxt->input->line = line; + ctxt->input->col = col; } /** @@ -3488,7 +3433,6 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { xmlChar *name = NULL; xmlChar *publicId = NULL; xmlChar *URI = NULL; - int nameCap, nameSize; int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? XML_MAX_TEXT_LENGTH : XML_MAX_NAME_LENGTH; @@ -3500,60 +3444,21 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { SKIP_BLANKS; - nameCap = 0; - nameSize = 0; - while (ctxt->input->cur < ctxt->input->end) { - int l; - int c = CUR_CHAR(l); + if ((ctxt->input->cur < ctxt->input->end) && (CUR != '>')) { + name = htmlParseData(ctxt, MASK_WS_GT, 0, 0, maxLength); - if (c == '>') - break; + if ((ctxt->options & HTML_PARSE_HTML5) && (name != NULL)) { + xmlChar *cur; - if (nameSize + 5 > nameCap) { - size_t newCap = nameCap ? nameCap * 2 : 32; - xmlChar *tmp = xmlRealloc(name, newCap); - - if (tmp == NULL) { - htmlErrMemory(ctxt); - xmlFree(name); - return; + for (cur = name; *cur; cur++) { + if ((*cur >= 'A') && (*cur <= 'Z')) + *cur += 0x20; } - - name = tmp; - nameCap = newCap; } - if (c < 0x80) { - if (IS_WS_HTML(c)) - break; - - if ((ctxt->options & HTML_PARSE_HTML5) && - (c >= 'A') && (c <= 'Z')) - c += 32; - - name[nameSize++] = c; - } else { - COPY_BUF(name, nameSize, c); - } - - if (nameSize > maxLength) { - htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT, - "identifier too long", NULL, NULL); - goto bogus; - } - - NEXTL(l); + SKIP_BLANKS; } - if (name != NULL) - name[nameSize] = 0; - - /* - * Check that upper(name) == "HTML" !!!!!!!!!!!!! 
- */ - - SKIP_BLANKS; - /* * Check for SystemID and publicId */ @@ -3576,14 +3481,7 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { } bogus: - /* Ignore bogus content */ - while (ctxt->input->cur < ctxt->input->end) { - int c = CUR; - - NEXT; - if (c == '>') - break; - } + htmlSkipBogusDoctype(ctxt); /* * Create or update the document accordingly to the DOCTYPE @@ -3821,7 +3719,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { (PARSER_STOPPED(ctxt) == 0)) { /* unexpected-solidus-in-tag */ if (CUR == '/') { - NEXT; + SKIP(1); SKIP_BLANKS; continue; } @@ -3973,7 +3871,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) /* unexpected-solidus-in-tag */ if (CUR == '/') { - NEXT; + SKIP(1); SKIP_BLANKS; continue; } @@ -3986,7 +3884,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) } if (CUR == '>') { - NEXT; + SKIP(1); } else if ((CUR == '/') && (NXT(1) == '>')) { SKIP(2); } else {