diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6f36fc1c..f12b2806 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -40,7 +40,7 @@ gcc:c89: extends: .test variables: CONFIG: "--without-python" - CFLAGS: "-O2 -std=c89 -D_XOPEN_SOURCE=600 -Wno-error=unused-function" + CFLAGS: "-O2 -std=c89 -D_XOPEN_SOURCE=600 -Wno-error=unused-function -Wno-error=overlength-strings" gcc:minimum: extends: .test diff --git a/HTMLparser.c b/HTMLparser.c index 0107b12d..f3e04834 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -50,10 +50,6 @@ #define HTML_PARSER_BIG_BUFFER_SIZE 1000 #define HTML_PARSER_BUFFER_SIZE 100 -#define IS_WS_HTML(c) \ - (((c) == 0x20) || \ - (((c) >= 0x09) && ((c) <= 0x0D) && ((c) != 0x0B))) - #define IS_HEX_DIGIT(c) \ ((IS_ASCII_DIGIT(c)) || \ ((((c) | 0x20) >= 'a') && (((c) | 0x20) <= 'f'))) @@ -314,17 +310,15 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt) #define CUR (*ctxt->input->cur) /** - * `the` HTML parser context + * Prescan to find encoding. * - * Ty to find and encoding in the current data available in the input - * buffer this is needed to try to switch to the proper encoding when - * one face a character error. - * That's an heuristic, since it's operating outside of parsing it could - * try to use a meta which had been commented out, that's the reason it - * should only be used in case of error, not as a default. + * Try to find an encoding in the current data available in the input + * buffer. * - * @returns an encoding string or NULL if not found, the string need to - * be freed + * TODO: Implement HTML5 prescan algorithm. 
+ * + * @param ctxt the HTML parser context + * @returns an encoding string or NULL if not found (caller must free) */ static xmlChar * htmlFindEncoding(xmlParserCtxtPtr ctxt) { @@ -3606,42 +3600,7 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { } /** - * Checks an http-equiv attribute from a Meta tag to detect - * the encoding - * If a new encoding is detected the parser is switched to decode - * it and pass UTF8 - * - * @param ctxt an HTML parser context - * @param attvalue the attribute value - */ -static void -htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { - const xmlChar *encoding; - xmlChar *copy; - - if (!attvalue) - return; - - encoding = xmlStrcasestr(attvalue, BAD_CAST"charset"); - if (encoding != NULL) { - encoding += 7; - } - /* - * skip blank - */ - if (encoding && IS_WS_HTML(*encoding)) - encoding = xmlStrcasestr(attvalue, BAD_CAST"="); - if (encoding && *encoding == '=') { - encoding ++; - copy = xmlStrdup(encoding); - if (copy == NULL) - htmlErrMemory(ctxt); - xmlSetDeclaredEncoding(ctxt, copy); - } -} - -/** - * Checks an attributes from a Meta tag + * Handle charset encoding in meta tag. 
* * @param ctxt an HTML parser context * @param atts the attributes values @@ -3650,7 +3609,7 @@ static void htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { int i; const xmlChar *att, *value; - int http = 0; + int isContentType = 0; const xmlChar *content = NULL; if ((ctxt == NULL) || (atts == NULL)) @@ -3663,23 +3622,33 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { if (value != NULL) { if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) && (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) { - http = 1; + isContentType = 1; } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) { - xmlChar *copy; + xmlChar *encoding; - copy = xmlStrdup(value); - if (copy == NULL) + encoding = xmlStrdup(value); + if (encoding == NULL) htmlErrMemory(ctxt); - xmlSetDeclaredEncoding(ctxt, copy); + xmlSetDeclaredEncoding(ctxt, encoding); } else if (!xmlStrcasecmp(att, BAD_CAST "content")) { content = value; } } att = atts[i++]; } - if ((http) && (content != NULL)) - htmlCheckEncoding(ctxt, content); + if ((isContentType) && (content != NULL)) { + htmlMetaEncodingOffsets off; + + if (htmlParseContentType(content, &off)) { + xmlChar *encoding; + + encoding = xmlStrndup(content + off.start, off.end - off.start); + if (encoding == NULL) + htmlErrMemory(ctxt); + xmlSetDeclaredEncoding(ctxt, encoding); + } + } } /** @@ -3748,7 +3717,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { const xmlChar **atts; int nbatts = 0; int maxatts; - int meta = 0; int i; int discardtag = 0; @@ -3763,8 +3731,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { name = htmlParseHTMLName(ctxt, 0).name; if (name == NULL) return; - if (xmlStrEqual(name, BAD_CAST"meta")) - meta = 1; if ((ctxt->options & HTML_PARSE_HTML5) == 0) { /* @@ -3960,8 +3926,10 @@ failed: /* * Handle specific association to the META tag */ - if (meta) + if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) && + (strcmp((char *) name, "meta") == 0)) { htmlCheckMeta(ctxt, atts); + } #endif } diff --git a/HTMLtree.c 
b/HTMLtree.c index 2e579b8e..8b79c12b 100644 --- a/HTMLtree.c +++ b/HTMLtree.c @@ -25,6 +25,7 @@ #include "private/buf.h" #include "private/error.h" +#include "private/html.h" #include "private/io.h" #include "private/save.h" @@ -34,265 +35,315 @@ * * ************************************************************************/ +typedef struct { + xmlAttrPtr attr; /* charset or content */ + const xmlChar *attrValue; + htmlMetaEncodingOffsets off; +} htmlMetaEncoding; + +static htmlNodePtr +htmlFindFirstChild(htmlNodePtr parent, const char *name) { + htmlNodePtr child; + + for (child = parent->children; child != NULL; child = child->next) { + if ((child->type == XML_ELEMENT_NODE) && + (child->ns == NULL) && + (xmlStrcasecmp(child->name, BAD_CAST name) == 0)) + return(child); + } + + return(NULL); +} + +static htmlNodePtr +htmlFindHead(htmlDocPtr doc) { + htmlNodePtr html; + + if (doc == NULL) + return(NULL); + + html = htmlFindFirstChild((htmlNodePtr) doc, "html"); + if (html == NULL) + return(NULL); + + return(htmlFindFirstChild(html, "head")); +} + +int +htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) { + const xmlChar *p = val; + + while (1) { + size_t start, end; + + while ((*p != 'c') && (*p != 'C')) { + if (*p == 0) + return(0); + p += 1; + } + p += 1; + + if (xmlStrncasecmp(p, BAD_CAST "harset", 6) != 0) + continue; + + p += 6; + while (IS_WS_HTML(*p)) p += 1; + + if (*p != '=') + continue; + + p += 1; + while (IS_WS_HTML(*p)) p += 1; + + if (*p == 0) + return(0); + + if ((*p == '"') || (*p == '\'')) { + int quote = *p; + + p += 1; + while (IS_WS_HTML(*p)) p += 1; + + start = p - val; + end = start; + + while (*p != quote) { + if (*p == 0) + return(0); + if (!IS_WS_HTML(*p)) + end = p + 1 - val; + p += 1; + } + } else { + start = p - val; + + while ((*p != 0) && (*p != ';') && (!IS_WS_HTML(*p))) + p += 1; + + end = p - val; + } + + off->start = start; + off->end = end; + off->size = p - val + strlen((char *) p); + + return(1); + } + + 
return(0); +} + +static xmlAttrPtr +htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) { + xmlAttrPtr attr, contentAttr = NULL; + int isContentType = 0; + + if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0) + return(NULL); + + for (attr = elem->properties; attr != NULL; attr = attr->next) { + if (attr->ns != NULL) + continue; + if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) { + *outIsContentType = 0; + return(attr); + } + if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0) + contentAttr = attr; + if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) && + (attr->children != NULL) && + (attr->children->type == XML_TEXT_NODE) && + (attr->children->next == NULL) && + (xmlStrcasecmp(attr->children->content, + BAD_CAST "Content-Type") == 0)) + isContentType = 1; + } + + if ((isContentType) && (contentAttr != NULL)) { + *outIsContentType = 1; + return(contentAttr); + } + + return(NULL); +} + +static int +htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) { + xmlAttrPtr attr; + const xmlChar *val = NULL; + int isContentType; + + if ((elem->type != XML_ELEMENT_NODE) || + (elem->ns != NULL) || + (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)) + return(0); + + attr = htmlFindMetaEncodingAttr(elem, &isContentType); + if (attr == NULL) + return(0); + + if ((attr->children != NULL) && + (attr->children->type == XML_TEXT_NODE) && + (attr->children->next == NULL) && + (attr->children->content != NULL)) + val = attr->children->content; + else + val = BAD_CAST ""; + + + if (!isContentType) { + size_t size = strlen((char *) val); + size_t start = 0; + size_t end = size; + + while ((start < size) && (IS_WS_HTML(val[start]))) + start += 1; + + while ((end > 0) && (IS_WS_HTML(val[end-1]))) + end -= 1; + + menc->attr = attr; + menc->attrValue = val; + menc->off.start = start; + menc->off.end = end; + menc->off.size = size; + + return(1); + } else { + if (htmlParseContentType(val, &menc->off)) { + menc->attr = attr; + menc->attrValue = 
val; + + return(1); + } + } + + return(0); +} + +static xmlChar * +htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) { + xmlChar *newVal, *p; + size_t size, oldEncSize, newEncSize; + + /* + * The pseudo "HTML" encoding only produces ASCII. + */ + if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0) + encoding = "ASCII"; + + oldEncSize = menc->off.end - menc->off.start; + newEncSize = strlen((char *) encoding); + size = menc->off.size - oldEncSize + newEncSize; + newVal = xmlMalloc(size + 1); + if (newVal == NULL) + return(NULL); + + p = newVal; + memcpy(p, menc->attrValue, menc->off.start); + p += menc->off.start; + memcpy(p, encoding, newEncSize); + p += newEncSize; + memcpy(p, menc->attrValue + menc->off.end, menc->off.size - menc->off.end); + newVal[size] = 0; + + return(newVal); +} + /** * Look up and encoding declaration in the meta tags. * - * Does not support `` yet. Only supports deprecated - * ``. - * - * The returned string points into attribute content. It should be - * copied before modifying or freeing nodes. + * The returned string points into attribute content and can contain + * trailing garbage. It should be copied before modifying or freeing + * nodes. * * @param doc the document * @returns the encoding ot NULL if not found. 
*/ const xmlChar * htmlGetMetaEncoding(htmlDocPtr doc) { - htmlNodePtr cur; - const xmlChar *content; - const xmlChar *encoding; + htmlNodePtr head, node; - if (doc == NULL) - return(NULL); - cur = doc->children; + head = htmlFindHead(doc); + if (head == NULL) + return(NULL); - /* - * Search the html - */ - while (cur != NULL) { - if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { - if (xmlStrEqual(cur->name, BAD_CAST"html")) - break; - if (xmlStrEqual(cur->name, BAD_CAST"head")) - goto found_head; - if (xmlStrEqual(cur->name, BAD_CAST"meta")) - goto found_meta; - } - cur = cur->next; + for (node = head->children; node != NULL; node = node->next) { + htmlMetaEncoding menc; + + if (htmlParseMetaEncoding(node, &menc)) { + /* + * Returning a `const xmlChar *` only allows to return + * a suffix. In http-equiv meta tags, there could be + * more data after the charset, although it's probably + * rare in practice. + */ + return(menc.attrValue + menc.off.start); + } } - if (cur == NULL) - return(NULL); - cur = cur->children; - /* - * Search the head - */ - while (cur != NULL) { - if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { - if (xmlStrEqual(cur->name, BAD_CAST"head")) - break; - if (xmlStrEqual(cur->name, BAD_CAST"meta")) - goto found_meta; - } - cur = cur->next; - } - if (cur == NULL) - return(NULL); -found_head: - cur = cur->children; - - /* - * Search the meta elements - */ -found_meta: - while (cur != NULL) { - if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { - if (xmlStrEqual(cur->name, BAD_CAST"meta")) { - xmlAttrPtr attr = cur->properties; - int http; - const xmlChar *value; - - content = NULL; - http = 0; - while (attr != NULL) { - if ((attr->children != NULL) && - (attr->children->type == XML_TEXT_NODE) && - (attr->children->next == NULL)) { - value = attr->children->content; - if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) - && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) - http = 1; - else if ((value != NULL) 
- && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) - content = value; - if ((http != 0) && (content != NULL)) - goto found_content; - } - attr = attr->next; - } - } - } - cur = cur->next; - } return(NULL); - -found_content: - encoding = xmlStrstr(content, BAD_CAST"charset="); - if (encoding == NULL) - encoding = xmlStrstr(content, BAD_CAST"Charset="); - if (encoding == NULL) - encoding = xmlStrstr(content, BAD_CAST"CHARSET="); - if (encoding != NULL) { - encoding += 8; - } else { - encoding = xmlStrstr(content, BAD_CAST"charset ="); - if (encoding == NULL) - encoding = xmlStrstr(content, BAD_CAST"Charset ="); - if (encoding == NULL) - encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); - if (encoding != NULL) - encoding += 9; - } - if (encoding != NULL) { - while ((*encoding == ' ') || (*encoding == '\t')) encoding++; - } - return(encoding); } /** * Creates or updates a meta tag with an encoding declaration. * - * Does not support `` yet. Only supports deprecated - * ``. - * * NOTE: This will not change the document content encoding. * * @param doc the document * @param encoding the encoding string - * @returns 0 in case of success and -1 in case of error + * @returns 0 in case of success, 1 if no head element was found or + * arguments are invalid and -1 if memory allocation failed. 
*/ int htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { - htmlNodePtr cur, meta = NULL, head = NULL; - const xmlChar *content = NULL; - char newcontent[100]; + htmlNodePtr head, meta; + int found = 0; - newcontent[0] = 0; + if (encoding == NULL) + return(1); - if (doc == NULL) - return(-1); + head = htmlFindHead(doc); + if (head == NULL) + return(1); - /* html isn't a real encoding it's just libxml2 way to get entities */ - if (!xmlStrcasecmp(encoding, BAD_CAST "html")) + for (meta = head->children; meta != NULL; meta = meta->next) { + htmlMetaEncoding menc; + + if (htmlParseMetaEncoding(meta, &menc)) { + xmlChar *newVal; + int ret; + + found = 1; + + newVal = htmlUpdateMetaEncoding(&menc, (char *) encoding); + if (newVal == NULL) + return(-1); + xmlNodeSetContent((xmlNodePtr) menc.attr, NULL); + ret = xmlNodeAddContent((xmlNodePtr) menc.attr, newVal); + xmlFree(newVal); + + if (ret < 0) + return(-1); + } + } + + if (found) + return(0); + + meta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL); + if (meta == NULL) return(-1); - if (encoding != NULL) { - snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", - (char *)encoding); - newcontent[sizeof(newcontent) - 1] = 0; - } - - cur = doc->children; - - /* - * Search the html - */ - while (cur != NULL) { - if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { - if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) - break; - if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) - goto found_head; - if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) - goto found_meta; - } - cur = cur->next; - } - if (cur == NULL) - return(-1); - cur = cur->children; - - /* - * Search the head - */ - while (cur != NULL) { - if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { - if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) - break; - if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { - head = cur->parent; - goto found_meta; - } - } - cur = cur->next; - } - if (cur == NULL) - return(-1); 
-found_head: - head = cur; - if (cur->children == NULL) - goto create; - cur = cur->children; - -found_meta: - /* - * Search and update all the remaining the meta elements carrying - * encoding information - */ - while (cur != NULL) { - if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { - if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { - xmlAttrPtr attr = cur->properties; - int http; - const xmlChar *value; - - content = NULL; - http = 0; - while (attr != NULL) { - if ((attr->children != NULL) && - (attr->children->type == XML_TEXT_NODE) && - (attr->children->next == NULL)) { - value = attr->children->content; - if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) - && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) - http = 1; - else - { - if ((value != NULL) && - (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) - content = value; - } - if ((http != 0) && (content != NULL)) - break; - } - attr = attr->next; - } - if ((http != 0) && (content != NULL)) { - meta = cur; - break; - } - - } - } - cur = cur->next; - } -create: - if (meta == NULL) { - if ((encoding != NULL) && (head != NULL)) { - /* - * Create a new Meta element with the right attributes - */ - - meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); - if (head->children == NULL) - xmlAddChild(head, meta); - else - xmlAddPrevSibling(head->children, meta); - xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); - xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); - } - } else { - /* remove the meta tag if NULL is passed */ - if (encoding == NULL) { - xmlUnlinkNode(meta); - xmlFreeNode(meta); - } - /* change the document only if there is a real encoding change */ - else if (xmlStrcasestr(content, encoding) == NULL) { - xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); - } + if (xmlNewProp(meta, BAD_CAST "charset", encoding) == NULL) { + xmlFreeNode(meta); + return(-1); } + if (head->children == NULL) + xmlAddChild(head, meta); + else + 
xmlAddPrevSibling(head->children, meta); return(0); } @@ -383,7 +434,7 @@ htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur, outbuf->written = 0; use = xmlBufUse(buf); - htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); + htmlNodeDumpInternal(outbuf, doc, cur, NULL, format); if (outbuf->error) ret = (size_t) -1; else @@ -455,7 +506,7 @@ htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, if (buf == NULL) return(-1); - htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format); + htmlNodeDumpInternal(buf, doc, cur, NULL, format); ret = xmlOutputBufferClose(buf); return(ret); @@ -479,14 +530,11 @@ htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { * Serialize an HTML node to a memory, also returning the size of * the result. It's up to the caller to free the memory. * - * WARNING: Uses the encoding from a deprecated meta tag, see - * htmlGetMetaEncoding(). This is typically undesired. If no such - * tag was found, ASCII with HTML 4.0 named character entities will + * Uses the encoding of the document. If the document has no + * encoding, ASCII with HTML 4.0 named character entities will * be used. This is inefficient compared to UTF-8 and might be * changed in a future version. * - * Use of this function is therefore DISCOURAGED in favor of - * htmlDocContentDumpFormatOutput(). 
* @param cur the document * @param mem OUT: the memory pointer * @param size OUT: the memory length @@ -496,7 +544,6 @@ void htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { xmlOutputBufferPtr buf; xmlCharEncodingHandlerPtr handler = NULL; - const char *encoding; xmlInitParser(); @@ -507,8 +554,7 @@ htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { if (cur == NULL) return; - encoding = (const char *) htmlGetMetaEncoding(cur); - if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK) + if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK) return; buf = xmlAllocOutputBuffer(handler); if (buf == NULL) @@ -657,18 +703,19 @@ htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) { /** * Serialize an HTML node to an output buffer. * - * Ignores `encoding` and uses the encoding of the output buffer. + * If `encoding` is specified, it is used to create or update meta + * tags containing the character encoding. + * * @param buf the HTML buffer output * @param doc the document * @param cur the current node - * @param encoding the encoding string (unused) + * @param encoding the encoding string (optional) * @param format should formatting newlines been added */ void -htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, - xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, - int format) { - xmlNodePtr root, parent; +htmlNodeDumpInternal(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, + const char *encoding, int format) { + xmlNodePtr root, parent, metaHead = NULL; xmlAttrPtr attr; const htmlElemDesc * info; @@ -699,24 +746,61 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, } break; - case XML_ELEMENT_NODE: + case XML_ELEMENT_NODE: { + htmlMetaEncoding menc; + int isMeta = 0; + int addMeta = 0; + /* * Some users like lxml are known to pass nodes with a corrupted * tree structure. Fall back to a recursive call to handle this * case. 
*/ if ((cur->parent != parent) && (cur->children != NULL)) { - htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); + htmlNodeDumpInternal(buf, doc, cur, encoding, format); break; } /* * Get specific HTML info for that node. */ - if (cur->ns == NULL) + if (cur->ns == NULL) { info = htmlTagLookup(cur->name); - else + + if (encoding != NULL) { + isMeta = htmlParseMetaEncoding(cur, &menc); + + /* + * Don't add meta tag for "HTML" encoding. + */ + if ((xmlStrcasecmp(BAD_CAST encoding, + BAD_CAST "HTML") != 0) && + (xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) && + (parent != NULL) && + (parent->ns == NULL) && + (xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) && + (parent->parent != NULL) && + (parent->parent->parent == NULL) && + (metaHead == NULL)) { + xmlNodePtr n; + + metaHead = cur; + addMeta = 1; + + for (n = cur->children; n != NULL; n = n->next) { + int unused; + + if (htmlFindMetaEncodingAttr(n, &unused) != NULL) { + metaHead = NULL; + addMeta = 0; + break; + } + } + } + } + } else { info = NULL; + } xmlOutputBufferWriteString(buf, "<"); if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { @@ -728,7 +812,23 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNsListDumpOutput(buf, cur->nsDef); attr = cur->properties; while (attr != NULL) { - htmlAttrDumpOutput(buf, doc, attr); + if ((!isMeta) || (attr != menc.attr)) { + htmlAttrDumpOutput(buf, doc, attr); + } else { + xmlChar *newVal; + + xmlOutputBufferWriteString(buf, " "); + xmlOutputBufferWriteString(buf, (char *) attr->name); + + newVal = htmlUpdateMetaEncoding(&menc, encoding); + if (newVal == NULL) { + buf->error = XML_ERR_NO_MEMORY; + return; + } + xmlOutputBufferWriteString(buf, "="); + xmlOutputBufferWriteQuotedString(buf, newVal); + xmlFree(newVal); + } attr = attr->next; } @@ -740,7 +840,14 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { xmlOutputBufferWriteString(buf, ">"); } else { - 
xmlOutputBufferWriteString(buf, ">"); + if (addMeta) { + xmlOutputBufferWriteString(buf, ">"); + } else { + xmlOutputBufferWriteString(buf, ">"); + } if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); @@ -751,13 +858,25 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, } } else { xmlOutputBufferWriteString(buf, ">"); - if ((format) && (info != NULL) && (!info->isinline) && - (cur->children->type != HTML_TEXT_NODE) && - (cur->children->type != HTML_ENTITY_REF_NODE) && - (cur->children != cur->last) && - (cur->name != NULL) && - (cur->name[0] != 'p')) /* p, pre, param */ + if ((format) && + ((addMeta) || + ((info != NULL) && (!info->isinline) && + (cur->children->type != HTML_TEXT_NODE) && + (cur->children->type != HTML_ENTITY_REF_NODE) && + (cur->children != cur->last) && + (cur->name != NULL) && + (cur->name[0] != 'p')))) /* p, pre, param */ xmlOutputBufferWriteString(buf, "\n"); + if (addMeta) { + xmlOutputBufferWriteString(buf, ""); + if ((format) && + (cur->children->type != HTML_TEXT_NODE) && + (cur->children->type != HTML_ENTITY_REF_NODE)) + xmlOutputBufferWriteString(buf, "\n"); + } parent = cur; cur = cur->children; continue; @@ -774,6 +893,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, } break; + } case XML_ATTRIBUTE_NODE: htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur); @@ -862,7 +982,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, if ((format) && (info != NULL) && (!info->isinline) && (cur->last->type != HTML_TEXT_NODE) && (cur->last->type != HTML_ENTITY_REF_NODE) && - (cur->children != cur->last) && + ((cur->children != cur->last) || (cur == metaHead)) && (cur->name != NULL) && (cur->name[0] != 'p')) /* p, pre, param */ xmlOutputBufferWriteString(buf, "\n"); @@ -884,32 +1004,48 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, (parent->name[0] != 'p')) /* p, pre, param */ xmlOutputBufferWriteString(buf, "\n"); } + 
+ if (cur == metaHead) + metaHead = NULL; } } } } +/** + * Serialize an HTML node to an output buffer. + * + * @param buf the HTML buffer output + * @param doc the document + * @param cur the current node + * @param encoding the encoding string (unused) + * @param format should formatting newlines been added + */ +void +htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, + const char *encoding ATTRIBUTE_UNUSED, int format) { + htmlNodeDumpInternal(buf, doc, cur, NULL, format); +} + /** * Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is * typically undesired. Use of this function is DISCOURAGED in favor * of htmlNodeDumpFormatOutput(). * - * Ignores `encoding` and uses the encoding of the output buffer. * @param buf the HTML buffer output * @param doc the document * @param cur the current node * @param encoding the encoding string (unused) */ void -htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, - xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) { - htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1); +htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, + const char *encoding ATTRIBUTE_UNUSED) { + htmlNodeDumpInternal(buf, doc, cur, NULL, 1); } /** * Serialize an HTML document to an output buffer. * - * Ignores `encoding` and uses the encoding of the output buffer. * @param buf the HTML buffer output * @param cur the document * @param encoding the encoding string (unused) @@ -919,31 +1055,14 @@ void htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding ATTRIBUTE_UNUSED, int format) { - int type = 0; - - /* - * This is needed when serializing XML documents as HTML. - * xmlEncodeEntitiesReentrant uses the document type to - * determine the serialization mode. - * - * Once we call more low-level functions directly with - * HTML flags, this hack can be removed. 
- */ - if (cur) { - type = cur->type; - cur->type = XML_HTML_DOCUMENT_NODE; - } - htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format); - if (cur) - cur->type = (xmlElementType) type; + htmlNodeDumpInternal(buf, cur, (xmlNodePtr) cur, NULL, format); } /** - * Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is - * typically undesired. Use of this function is DISCOURAGED in favor - * of htmlDocContentDumpFormatOutput(). + * Same as htmlDocContentDumpFormatOutput() with `format` set to 1 + * which is typically undesired. Use of this function is DISCOURAGED + * in favor of htmlDocContentDumpFormatOutput(). * - * Ignores `encoding` and uses the encoding of the output buffer. * @param buf the HTML buffer output * @param cur the document * @param encoding the encoding string (unused) @@ -951,7 +1070,7 @@ htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, void htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding ATTRIBUTE_UNUSED) { - htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1); + htmlNodeDumpInternal(buf, cur, (xmlNodePtr) cur, NULL, 1); } /************************************************************************ @@ -963,13 +1082,12 @@ htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, /** * Serialize an HTML document to an open `FILE`. * - * WARNING: Uses the encoding from a deprecated meta tag, see - * htmlGetMetaEncoding(). This is typically undesired. If no such - * tag was found, ASCII with HTML 4.0 named character entities will + * Uses the encoding of the document. If the document has no + * encoding, ASCII with HTML 4.0 named character entities will * be used. This is inefficient compared to UTF-8 and might be * changed in a future version. * - * Also enables "formatting" unconditionally which is typically + * Enables "formatting" unconditionally which is typically * undesired. 
* * Use of this function is DISCOURAGED in favor of @@ -983,7 +1101,6 @@ int htmlDocDump(FILE *f, xmlDocPtr cur) { xmlOutputBufferPtr buf; xmlCharEncodingHandlerPtr handler = NULL; - const char *encoding; int ret; xmlInitParser(); @@ -992,8 +1109,7 @@ htmlDocDump(FILE *f, xmlDocPtr cur) { return(-1); } - encoding = (const char *) htmlGetMetaEncoding(cur); - if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK) + if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK) return(-1); buf = xmlOutputBufferCreateFile(f, handler); if (buf == NULL) @@ -1005,18 +1121,10 @@ htmlDocDump(FILE *f, xmlDocPtr cur) { } /** - * Serialize an HTML document to a file. If `filename` is `"-"`, - * stdout is used. This is potentially insecure and might be - * changed in a future version. + * Serialize an HTML document to a file. * - * WARNING: Uses the encoding from a deprecated meta tag, see - * htmlGetMetaEncoding(). This is typically undesired. If no such - * tag was found, ASCII with HTML 4.0 named character entities will - * be used. This is inefficient compared to UTF-8 and might be - * changed in a future version. - * - * Also enables "formatting" unconditionally which is typically - * undesired. + * Same as htmlSaveFileFormat() with `encoding` set to NULL and + * `format` set to 1 which is typically undesired. * * Use of this function is DISCOURAGED in favor of * htmlSaveFileFormat(). 
@@ -1027,31 +1135,12 @@ htmlDocDump(FILE *f, xmlDocPtr cur) { */ int htmlSaveFile(const char *filename, xmlDocPtr cur) { - xmlOutputBufferPtr buf; - xmlCharEncodingHandlerPtr handler = NULL; - const char *encoding; - int ret; - - if ((cur == NULL) || (filename == NULL)) - return(-1); - - xmlInitParser(); - - encoding = (const char *) htmlGetMetaEncoding(cur); - if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK) - return(-1); - buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); - if (buf == NULL) - return(-1); - - htmlDocContentDumpOutput(buf, cur, NULL); - - ret = xmlOutputBufferClose(buf); - return(ret); + return(htmlSaveFileFormat(filename, cur, NULL, 1)); } /** * Serialize an HTML document to a file using a given encoding. + * * If `filename` is `"-"`, stdout is used. This is potentially * insecure and might be changed in a future version. * @@ -1059,6 +1148,8 @@ htmlSaveFile(const char *filename, xmlDocPtr cur) { * will be used. This is inefficient compared to UTF-8 and might be * changed in a future version. * + * Sets or updates meta tags containing the character encoding. + * * @param filename the filename * @param cur the document * @param format should formatting newlines been added @@ -1079,15 +1170,11 @@ htmlSaveFileFormat(const char *filename, xmlDocPtr cur, if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK) return(-1); - if (handler != NULL) - htmlSetMetaEncoding(cur, (const xmlChar *) handler->name); - else - htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); /* * save the content to a temp buffer. */ - buf = xmlOutputBufferCreateFilename(filename, handler, 0); + buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); if (buf == NULL) return(0); @@ -1098,6 +1185,8 @@ htmlSaveFileFormat(const char *filename, xmlDocPtr cur, } /** + * Serialize an HTML document to a file. + * * Same as htmlSaveFileFormat() with `format` set to 1 which is * typically undesired. Also see the warnings there. 
Use of this * function is DISCOURAGED in favor of htmlSaveFileFormat(). diff --git a/include/private/html.h b/include/private/html.h index 415be221..e5590edc 100644 --- a/include/private/html.h +++ b/include/private/html.h @@ -5,9 +5,26 @@ #ifdef LIBXML_HTML_ENABLED +#define IS_WS_HTML(c) \ + (((c) == 0x20) || \ + (((c) >= 0x09) && ((c) <= 0x0D) && ((c) != 0x0B))) + +typedef struct { + size_t start; + size_t end; + size_t size; +} htmlMetaEncodingOffsets; + XML_HIDDEN xmlNodePtr htmlCtxtParseContentInternal(xmlParserCtxtPtr ctxt, xmlParserInputPtr input); +XML_HIDDEN int +htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off); + +XML_HIDDEN void +htmlNodeDumpInternal(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, + const char *encoding, int format); + #endif /* LIBXML_HTML_ENABLED */ #endif /* XML_HTML_H_PRIVATE__ */ diff --git a/python/tests/serialize.py b/python/tests/serialize.py index 4666ec46..11db04c0 100755 --- a/python/tests/serialize.py +++ b/python/tests/serialize.py @@ -77,17 +77,14 @@ if str != """ -
hello
+hello
""": print("error serializing HTML document 2") sys.exit(1) str = doc.serialize(format=1) if str != """ - - -hello
""": @@ -97,13 +94,13 @@ str = doc.serialize("iso-8859-1", 1) if str != """ - +hello
""": - print("error serializing HTML document 4") + print("error serializing HTML document 4", str) sys.exit(1) # @@ -116,15 +113,12 @@ if str != """hello
hello
""": +if str != """hello
""": print("error serializing HTML root 2") sys.exit(1) str = root.serialize(format=1) if str != """ - - -hello
""": print("error serializing HTML root 3") @@ -132,7 +126,7 @@ if str != """ str = root.serialize("iso-8859-1", 1) if str != """ - +hello
diff --git a/result/HTML/758518-entity.html b/result/HTML/758518-entity.html index 7dde7c6c..bfc2b664 100644 --- a/result/HTML/758518-entity.html +++ b/result/HTML/758518-entity.html @@ -1,2 +1,2 @@ -&jÙ +&jÙ diff --git a/result/HTML/758518-tag.html b/result/HTML/758518-tag.html index aa0ad342..de8ff580 100644 --- a/result/HTML/758518-tag.html +++ b/result/HTML/758518-tag.html @@ -1,2 +1,2 @@ - + diff --git a/result/HTML/758605.html b/result/HTML/758605.html index 77f70a00..97ea22a4 100644 --- a/result/HTML/758605.html +++ b/result/HTML/758605.html @@ -1,3 +1,3 @@ -&:ê +&:ê diff --git a/result/HTML/758606_2.html b/result/HTML/758606_2.html index 1258fccd..f9b69b4d 100644 --- a/result/HTML/758606_2.html +++ b/result/HTML/758606_2.html @@ -1,3 +1,3 @@ - diff --git a/result/HTML/fp40.htm b/result/HTML/fp40.htm index 8affc19d..42ce90aa 100644 --- a/result/HTML/fp40.htm +++ b/result/HTML/fp40.htm @@ -11,7 +11,7 @@The FrontPage Server Extensions are a set of programs on the Web server that support: @@ -24,11 +24,11 @@ -
+
-ResourceConfig /dev/null@@ -160,7 +160,7 @@ answering inquiries, so you can write your question in your own words. To begin, -
+ResourceConfig /dev/null
AccessConfig /dev/null
+
diff --git a/result/HTML/html5_enc.html b/result/HTML/html5_enc.html index 44ceebca..30edf290 100644 --- a/result/HTML/html5_enc.html +++ b/result/HTML/html5_enc.html @@ -4,6 +4,6 @@ -
très
+très
diff --git a/result/HTML/wired.html b/result/HTML/wired.html index 7441fc81..f9c4018d 100644 --- a/result/HTML/wired.html +++ b/result/HTML/wired.html @@ -91,7 +91,7 @@+ | HITS & MISC. | |
+ |
![]() @@ -434,12 +434,12 @@ or PointCast | |
+ | CURRENT HOO-HA | |
+ |
![]() @@ -466,26 +466,26 @@ or PointCast Y2K Watch Tick... Tick... Tick... -More Hoo-Ha +More Hoo-Ha |
|
+ | MEANWHILE... | |
+ |
![]() -Führer Furor
-Contruction workers in Berlin opened an old wound in the German psyche this week when they accidentally stumbled across Adolf Hitler's bunker while excavating near the Brandenburg Gate. The bunker, just south of the Gate, was where Hitler and his closest associates barricaded themselves as the Red Army approached Berlin in the waning days of World War II. It is also where the Führer and his bride, Eva Braun, committed suicide rather than fall into the hands of the Russians. Although the bunker's location has never been a mystery, it has been sealed off since the end of the war to keep neo-Nazis from turning it into a shrine.
+Führer Furor
+Contruction workers in Berlin opened an old wound in the German psyche this week when they accidentally stumbled across Adolf Hitler's bunker while excavating near the Brandenburg Gate. The bunker, just south of the Gate, was where Hitler and his closest associates barricaded themselves as the Red Army approached Berlin in the waning days of World War II. It is also where the Führer and his bride, Eva Braun, committed suicide rather than fall into the hands of the Russians. Although the bunker's location has never been a mystery, it has been sealed off since the end of the war to keep neo-Nazis from turning it into a shrine.
|
Send us feedback
- |
+ |
Work at Wired Digital
- |
+ |
Advertise with us
About Wired Digital
- |
+ |
Our Privacy Policy
Copyright © 1994-99 Wired Digital Inc. All rights reserved. +
Copyright © 1994-99 Wired Digital Inc. All rights reserved.
diff --git a/result/HTML/xml-declaration-1.html b/result/HTML/xml-declaration-1.html
index 1d0ca6c0..e4e9c35a 100644
--- a/result/HTML/xml-declaration-1.html
+++ b/result/HTML/xml-declaration-1.html
@@ -1,4 +1,4 @@
öäüß
+öäüß
diff --git a/testparser.c b/testparser.c index aacdd679..24264bcc 100644 --- a/testparser.c +++ b/testparser.c @@ -14,6 +14,7 @@ #include