/*
 * HTMLtree.c : implementation of access function for an HTML tree.
 *
 * See Copyright for the status of this software.
 *
 * Author: Daniel Veillard
 */

#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_HTML_ENABLED

/*
 * NOTE(review): the header names of the following ten #include
 * directives are missing in this copy of the file (anything written
 * between angle brackets appears to have been stripped). Restore them
 * from the upstream source before building.
 */
#include /* for memset() only ! */
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* NOTE(review): "private/html.h" is listed twice below. */
#include "private/buf.h"
#include "private/html.h"
#include "private/error.h"
#include "private/html.h"
#include "private/io.h"
#include "private/save.h"
#include "private/tree.h"

/************************************************************************
 *                                                                      *
 *              Getting/Setting encoding meta tags                      *
 *                                                                      *
 ************************************************************************/

/*
 * Describes an encoding declaration located in a meta element, as
 * filled in by htmlParseMetaEncoding().
 */
typedef struct {
    /* the attribute carrying the declaration */
    xmlAttrPtr attr; /* charset or content */
    /* text of that attribute; the offsets in `off` index into it */
    const xmlChar *attrValue;
    /* start/end of the encoding name and total size of attrValue */
    htmlMetaEncodingOffsets off;
} htmlMetaEncoding;

/*
 * Find the first direct child of `parent` that is an element node and
 * whose name equals `name`, compared case-insensitively.
 *
 * Returns the matching child or NULL if there is none.
 */
static htmlNodePtr
htmlFindFirstChild(htmlNodePtr parent, const char *name) {
    htmlNodePtr child;

    for (child = parent->children; child != NULL; child = child->next) {
        if ((child->type == XML_ELEMENT_NODE) &&
            (xmlStrcasecmp(child->name, BAD_CAST name) == 0))
            return(child);
    }

    return(NULL);
}

/*
 * Locate the "head" element of a document: the first "head" element
 * child of the first "html" element child of the document node.
 *
 * Returns the head element, or NULL if `doc` is NULL or either element
 * is missing.
 */
static htmlNodePtr
htmlFindHead(htmlDocPtr doc) {
    htmlNodePtr html;

    if (doc == NULL)
        return(NULL);

    html = htmlFindFirstChild((htmlNodePtr) doc, "html");
    if (html == NULL)
        return(NULL);

    return(htmlFindFirstChild(html, "head"));
}

/*
 * Search a string (the value of a "content" attribute, as used by
 * htmlParseMetaEncoding) for a "charset = value" parameter.
 *
 * "charset" is matched case-insensitively and may be separated from
 * the '=' and from the value by characters accepted by IS_WS_HTML.
 * The value may be enclosed in double or single quotes, or unquoted.
 *
 * val: the zero-terminated string to scan; it is dereferenced
 *      unconditionally, so it must not be NULL
 * off: on success, receives offsets relative to `val`:
 *        start - first byte of the encoding name
 *        end   - one past the last byte of the encoding name
 *        size  - total length of `val`
 *
 * Returns 1 if a charset value was found (`off` is filled in), 0
 * otherwise (`off` is left untouched).
 */
int
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) {
    const xmlChar *p = val;

    while (1) {
        size_t start, end;

        /* Advance to the next 'c' or 'C'; give up at end of string. */
        while ((*p != 'c') && (*p != 'C')) {
            if (*p == 0)
                return(0);
            p += 1;
        }
        p += 1;

        /*
         * Not "charset": resume the search right after the 'c' that
         * was just consumed.
         */
        if (xmlStrncasecmp(p, BAD_CAST "harset", 6) != 0)
            continue;

        p += 6;
        while (IS_WS_HTML(*p)) p += 1;

        /* "charset" not followed by '=': keep searching from here. */
        if (*p != '=')
            continue;

        p += 1;
        while (IS_WS_HTML(*p)) p += 1;

        /* '=' at the very end of the string: no value. */
        if (*p == 0)
            return(0);

        if ((*p == '"') || (*p == '\'')) {
            int quote = *p;

            /* Skip the opening quote and whitespace following it. */
            p += 1;
            while (IS_WS_HTML(*p)) p += 1;

            start = p - val;
            end = start;

            /*
             * Scan up to the matching quote. `end` only moves past
             * non-whitespace bytes, so whitespace directly before the
             * closing quote is excluded from the value.
             */
            while (*p != quote) {
                /* Missing closing quote: treat as not found. */
                if (*p == 0)
                    return(0);
                if (!IS_WS_HTML(*p))
                    end = p + 1 - val;
                p += 1;
            }
        } else {
            /* Unquoted value: ends at end of string, ';' or whitespace. */
            start = p - val;

            while ((*p != 0) && (*p != ';') && (!IS_WS_HTML(*p)))
                p += 1;

            end = p - val;
        }

        off->start = start;
        off->end = end;
        /* Bytes consumed so far plus the unscanned rest of the string. */
        off->size = p - val + strlen((char *) p);

        return(1);
    }

    /* Not reached: the loop above only exits through return. */
    return(0);
}

/*
 * Find the attribute of a meta element that carries an encoding
 * declaration. Attributes with a namespace are ignored.
 *
 * elem:             the element to inspect
 * outIsContentType: set to 0 when a "charset" attribute is returned,
 *                   set to 1 when a "content" attribute is returned;
 *                   not written when NULL is returned
 *
 * Returns the first "charset" attribute encountered. Otherwise, if the
 * element has an "http-equiv" attribute whose value is a single text
 * node equal (case-insensitively) to "Content-Type" and also has a
 * "content" attribute, the (last seen) "content" attribute is
 * returned. Returns NULL if `elem` is not named "meta" or neither form
 * is present.
 */
static xmlAttrPtr
htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) {
    xmlAttrPtr attr, contentAttr = NULL;
    int isContentType = 0;

    if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)
        return(NULL);

    for (attr = elem->properties; attr != NULL; attr = attr->next) {
        if (attr->ns != NULL)
            continue;

        /* A charset attribute wins immediately. */
        if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) {
            *outIsContentType = 0;
            return(attr);
        }

        /* Remember the content attribute; it is only used if paired
         * with http-equiv="Content-Type". */
        if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0)
            contentAttr = attr;

        if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) &&
            (attr->children != NULL) &&
            (attr->children->type == XML_TEXT_NODE) &&
            (attr->children->next == NULL) &&
            (xmlStrcasecmp(attr->children->content,
                           BAD_CAST "Content-Type") == 0))
            isContentType = 1;
    }

    if ((isContentType) && (contentAttr != NULL)) {
        *outIsContentType = 1;
        return(contentAttr);
    }

    return(NULL);
}

/*
 * Check whether a node is a meta element declaring an encoding and, if
 * so, describe where the encoding name is located.
 *
 * elem: the node to inspect
 * menc: filled in on success; `attr` is the declaring attribute,
 *       `attrValue` its text and `off` the position of the encoding
 *       name within that text
 *
 * Returns 1 if an encoding declaration was found, 0 otherwise.
 */
static int
htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) {
    xmlAttrPtr attr;
    const xmlChar *val = NULL;
    int isContentType;

    if ((elem->type != XML_ELEMENT_NODE) ||
        (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
        return(0);

    attr = htmlFindMetaEncodingAttr(elem, &isContentType);
    if (attr == NULL)
        return(0);

    /*
     * Use the attribute text only when it consists of exactly one text
     * node with content; anything else is treated as an empty value.
     */
    if ((attr->children != NULL) &&
        (attr->children->type == XML_TEXT_NODE) &&
        (attr->children->next == NULL) &&
        (attr->children->content != NULL))
        val = attr->children->content;
    else
        val = BAD_CAST "";

    if (!isContentType) {
        /*
         * charset attribute: the encoding name is the whole value
         * with leading and trailing whitespace trimmed.
         */
        size_t size = strlen((char *) val);
        size_t start = 0;
        size_t end = size;

        while ((start < size) && (IS_WS_HTML(val[start])))
            start += 1;

        while ((end > 0) && (IS_WS_HTML(val[end-1])))
            end -= 1;

        menc->attr = attr;
        menc->attrValue = val;
        menc->off.start = start;
        menc->off.end = end;
        menc->off.size = size;

        return(1);
    } else {
        /* content attribute: look for a charset parameter inside. */
        if (htmlParseContentType(val, &menc->off)) {
            menc->attr = attr;
            menc->attrValue = val;

            return(1);
        }
    }

    return(0);
}

/*
 * Build a new attribute value in which the encoding name described by
 * `menc` is replaced with `encoding`; the text before and after the
 * old name is copied unchanged.
 *
 * menc:     a declaration as filled in by htmlParseMetaEncoding()
 * encoding: the new encoding name; "HTML" (any case) is written as
 *           "ASCII"
 *
 * Returns a zero-terminated string allocated with xmlMalloc (the
 * caller in this file releases it with xmlFree), or NULL if the
 * allocation failed.
 */
static xmlChar *
htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) {
    xmlChar *newVal, *p;
    size_t size, oldEncSize, newEncSize;

    /*
     * The pseudo "HTML" encoding only produces ASCII.
     */
    if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0)
        encoding = "ASCII";

    oldEncSize = menc->off.end - menc->off.start;
    newEncSize = strlen((char *) encoding);
    size = menc->off.size - oldEncSize + newEncSize;
    newVal = xmlMalloc(size + 1);
    if (newVal == NULL)
        return(NULL);

    p = newVal;
    /* prefix: everything before the old encoding name */
    memcpy(p, menc->attrValue, menc->off.start);
    p += menc->off.start;
    /* the new encoding name */
    memcpy(p, encoding, newEncSize);
    p += newEncSize;
    /* suffix: everything after the old encoding name */
    memcpy(p, menc->attrValue + menc->off.end, menc->off.size - menc->off.end);
    newVal[size] = 0;

    return(newVal);
}

/**
 * Look up an encoding declaration in the meta tags.
 *
 * The returned string points into attribute content and can contain
 * trailing garbage. It should be copied before modifying or freeing
 * nodes.
 *
 * @param doc  the document
 * @returns the encoding or NULL if not found.
 */
const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc) {
    htmlNodePtr head, node;

    head = htmlFindHead(doc);
    if (head == NULL)
        return(NULL);

    /* Only direct children of head are examined; the first match wins. */
    for (node = head->children; node != NULL; node = node->next) {
        htmlMetaEncoding menc;

        if (htmlParseMetaEncoding(node, &menc)) {
            /*
             * Returning a `const xmlChar *` only allows to return
             * a suffix. In http-equiv meta tags, there could be
             * more data after the charset, although it's probably
             * rare in practice.
             */
            return(menc.attrValue + menc.off.start);
        }
    }

    return(NULL);
}

/**
 * Creates or updates a meta tag with an encoding declaration.
 *
 * Every meta element directly under head that declares an encoding is
 * rewritten. If there is none, a new meta element with a charset
 * attribute is inserted as the first child of head.
 *
 * NOTE: This will not change the document content encoding.
 *
 * @param doc  the document
 * @param encoding  the encoding string
 * @returns 0 in case of success, 1 if no head element was found or
 * arguments are invalid and -1 if memory allocation failed.
 */
int
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
    htmlNodePtr head, meta;
    int found = 0;

    if (encoding == NULL)
        return(1);

    head = htmlFindHead(doc);
    if (head == NULL)
        return(1);

    /* The loop does not stop at the first match: all declarations
     * found among the children of head are updated. */
    for (meta = head->children; meta != NULL; meta = meta->next) {
        htmlMetaEncoding menc;

        if (htmlParseMetaEncoding(meta, &menc)) {
            xmlChar *newVal;
            int ret;

            found = 1;

            /*
             * Build the replacement first: menc.attrValue points at
             * the attribute's current text content, which the calls
             * below presumably discard (NOTE(review): confirm against
             * xmlNodeSetContent).
             */
            newVal = htmlUpdateMetaEncoding(&menc, (char *) encoding);
            if (newVal == NULL)
                return(-1);
            xmlNodeSetContent((xmlNodePtr) menc.attr, NULL);
            ret = xmlNodeAddContent((xmlNodePtr) menc.attr, newVal);
            xmlFree(newVal);

            if (ret < 0)
                return(-1);
        }
    }

    if (found)
        return(0);

    /* No existing declaration: create a meta element with a charset
     * attribute. */
    meta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL);
    if (meta == NULL)
        return(-1);

    if (xmlNewProp(meta, BAD_CAST "charset", encoding) == NULL) {
        xmlFreeNode(meta);
        return(-1);
    }

    /* Insert the new element in front of any existing children. */
    if (head->children == NULL)
        xmlAddChild(head, meta);
    else
        xmlAddPrevSibling(head->children, meta);

    return(0);
}

/**
 * Determine if a given attribute is a boolean attribute. This
 * doesn't handle HTML5.
 *
 * @deprecated Internal function, don't use.
 *
 * @param name  the name of the attribute to check
 * @returns false if the attribute is not boolean, true otherwise.
 */
int
htmlIsBooleanAttr(const xmlChar *name) {
    const char *str = NULL;

    if (name == NULL)
        return(0);

    /*
     * These are the HTML attributes which will be output
     * in minimized form, i.e. `