/*
* HTMLtree.c : implementation of access function for an HTML tree.
*
* See Copyright for the status of this software.
*
* Author: Daniel Veillard
*/
#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_HTML_ENABLED
#include /* for memset() only ! */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "private/buf.h"
#include "private/html.h"
#include "private/error.h"
#include "private/html.h"
#include "private/io.h"
#include "private/save.h"
#include "private/tree.h"
/************************************************************************
* *
* Getting/Setting encoding meta tags *
* *
************************************************************************/
/*
 * Describes where an encoding declaration was found inside a meta
 * element, so that the encoding name can be read or replaced in place.
 */
typedef struct {
xmlAttrPtr attr; /* charset or content */
const xmlChar *attrValue; /* text value of attr, or "" if it has no single text child */
htmlMetaEncodingOffsets off; /* start/end of the encoding name within attrValue, and the total size of attrValue */
} htmlMetaEncoding;
/**
 * Find the first child element with the given name.
 *
 * Only element nodes without a namespace are considered and the name
 * comparison is case-insensitive.
 *
 * @param parent  node whose direct children are scanned
 * @param name  element name to look for
 * @returns the first matching child or NULL if there is none.
 */
static htmlNodePtr
htmlFindFirstChild(htmlNodePtr parent, const char *name) {
    htmlNodePtr cur = parent->children;

    while (cur != NULL) {
        /* The node type is tested before any other field is read. */
        if (cur->type == XML_ELEMENT_NODE &&
            cur->ns == NULL &&
            xmlStrcasecmp(cur->name, BAD_CAST name) == 0)
            break;
        cur = cur->next;
    }

    /* Either the match we stopped on, or NULL after the last sibling. */
    return(cur);
}
/**
 * Locate the head element of a document.
 *
 * Looks for the first "html" child of the document and, inside it,
 * for the first "head" child.
 *
 * @param doc  the document, may be NULL
 * @returns the head element or NULL if the document is NULL or has
 * no html/head element.
 */
static htmlNodePtr
htmlFindHead(htmlDocPtr doc) {
    htmlNodePtr root;

    if (doc == NULL)
        return(NULL);

    root = htmlFindFirstChild((htmlNodePtr) doc, "html");

    return((root != NULL) ? htmlFindFirstChild(root, "head") : NULL);
}
/**
 * Search a string for a `charset=` parameter and report where its
 * value is located.
 *
 * The keyword is matched case-insensitively and may be separated from
 * the `=` sign and from the value by whitespace. The value is either
 * quoted with `"` or `'` (surrounding whitespace inside the quotes is
 * excluded) or unquoted, in which case it ends at whitespace, `;` or
 * the end of the string.
 *
 * @param val  zero-terminated string to scan
 * @param off  on success, receives the start and end offsets of the
 *             value relative to val and the total length of val
 * @returns 1 if a charset value was found, 0 otherwise (including an
 * unterminated quoted value).
 */
int
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) {
    const xmlChar *cur = val;

    for (;;) {
        size_t encStart, encEnd;

        /* Advance to the next candidate 'c'; give up at end of string. */
        while ((*cur != 0) && (*cur != 'c') && (*cur != 'C'))
            cur++;
        if (*cur == 0)
            return(0);
        cur++;

        /* Not "charset": resume scanning right after that 'c'. */
        if (xmlStrncasecmp(cur, BAD_CAST "harset", 6) != 0)
            continue;
        cur += 6;

        while (IS_WS_HTML(*cur))
            cur++;

        /* No '=' follows: resume scanning from the current position. */
        if (*cur != '=')
            continue;
        cur++;

        while (IS_WS_HTML(*cur))
            cur++;

        if (*cur == 0)
            return(0);

        if ((*cur != '"') && (*cur != '\'')) {
            /* Unquoted value. */
            encStart = cur - val;

            while ((*cur != 0) && (*cur != ';') && (!IS_WS_HTML(*cur)))
                cur++;

            encEnd = cur - val;
        } else {
            /* Quoted value: must be closed by the same quote character. */
            int delim = *cur;

            cur++;
            while (IS_WS_HTML(*cur))
                cur++;

            encStart = cur - val;
            encEnd = encStart;

            for (; *cur != delim; cur++) {
                if (*cur == 0)
                    return(0);
                /* Track the last non-whitespace char to trim the tail. */
                if (!IS_WS_HTML(*cur))
                    encEnd = cur + 1 - val;
            }
        }

        off->start = encStart;
        off->end = encEnd;
        /* Bytes consumed so far plus whatever remains after cur. */
        off->size = (cur - val) + strlen((char *) cur);

        return(1);
    }
}
/**
 * Find the attribute of a meta element that carries the encoding.
 *
 * A namespace-less `charset` attribute is returned as soon as it is
 * seen. Otherwise the `content` attribute is returned, but only if the
 * element also has an `http-equiv` attribute whose single text child
 * equals "Content-Type" (case-insensitive).
 *
 * @param elem  element to inspect
 * @param outIsContentType  set to 0 when a charset attribute is
 *        returned and to 1 when a content attribute is returned;
 *        left untouched when NULL is returned
 * @returns the attribute or NULL if the element declares no encoding.
 */
static xmlAttrPtr
htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) {
    xmlAttrPtr cur;
    xmlAttrPtr content = NULL;
    int sawContentType = 0;

    if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)
        return(NULL);

    for (cur = elem->properties; cur != NULL; cur = cur->next) {
        if (cur->ns != NULL)
            continue;

        if (xmlStrcasecmp(cur->name, BAD_CAST "charset") == 0) {
            /* A charset attribute wins over http-equiv/content. */
            *outIsContentType = 0;
            return(cur);
        }

        if (xmlStrcasecmp(cur->name, BAD_CAST "content") == 0) {
            content = cur;
        } else if (xmlStrcasecmp(cur->name, BAD_CAST "http-equiv") == 0) {
            xmlNodePtr text = cur->children;

            if ((text != NULL) &&
                (text->type == XML_TEXT_NODE) &&
                (text->next == NULL) &&
                (xmlStrcasecmp(text->content,
                               BAD_CAST "Content-Type") == 0))
                sawContentType = 1;
        }
    }

    if ((!sawContentType) || (content == NULL))
        return(NULL);

    *outIsContentType = 1;
    return(content);
}
/**
 * Extract the encoding declaration from a meta element.
 *
 * The node must be a namespace-less element named "meta"
 * (case-insensitive). The declaration is taken either from a
 * `charset` attribute, whose value is trimmed of surrounding
 * whitespace, or from the `content` attribute of an
 * `http-equiv="Content-Type"` element, which is parsed with
 * htmlParseContentType().
 *
 * @param elem  node to inspect
 * @param menc  on success, receives the attribute, its text value and
 *              the offsets of the encoding name within that value;
 *              off.start <= off.end <= off.size always holds for the
 *              charset case
 * @returns 1 if an encoding declaration was found, 0 otherwise.
 */
static int
htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) {
    xmlAttrPtr attr;
    const xmlChar *val;
    int isContentType = 0;

    if ((elem->type != XML_ELEMENT_NODE) ||
        (elem->ns != NULL) ||
        (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
        return(0);

    attr = htmlFindMetaEncodingAttr(elem, &isContentType);
    if (attr == NULL)
        return(0);

    /*
     * Only a value stored as a single text node can be addressed by
     * offsets. Anything else is treated as an empty value.
     */
    if ((attr->children != NULL) &&
        (attr->children->type == XML_TEXT_NODE) &&
        (attr->children->next == NULL) &&
        (attr->children->content != NULL))
        val = attr->children->content;
    else
        val = BAD_CAST "";

    if (isContentType) {
        if (!htmlParseContentType(val, &menc->off))
            return(0);
    } else {
        size_t size = strlen((const char *) val);
        size_t start = 0;
        size_t end = size;

        while ((start < size) && (IS_WS_HTML(val[start])))
            start += 1;

        /*
         * Stop trimming at start, not at 0. For an all-whitespace
         * value, start has already reached size, and trimming down to 0
         * would leave end < start. Callers compute end - start as an
         * unsigned length, so that must never happen.
         */
        while ((end > start) && (IS_WS_HTML(val[end-1])))
            end -= 1;

        menc->off.start = start;
        menc->off.end = end;
        menc->off.size = size;
    }

    menc->attr = attr;
    menc->attrValue = val;

    return(1);
}
/**
 * Build a new attribute value in which the encoding name is replaced.
 *
 * The text before and after the old encoding name, as described by
 * the offsets in menc, is kept unchanged. The pseudo encoding "HTML"
 * (case-insensitive) is written as "ASCII".
 *
 * @param menc  location of the current encoding declaration
 * @param encoding  new encoding name
 * @returns a newly allocated, zero-terminated string or NULL if the
 * allocation failed.
 */
static xmlChar *
htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) {
    const xmlChar *oldVal = menc->attrValue;
    size_t prefixLen = menc->off.start;
    size_t suffixPos = menc->off.end;
    size_t suffixLen = menc->off.size - suffixPos;
    size_t encLen, total;
    xmlChar *result;

    /*
     * The pseudo "HTML" encoding only produces ASCII.
     */
    if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0)
        encoding = "ASCII";
    encLen = strlen((char *) encoding);

    /* prefix + new name + suffix */
    total = prefixLen + encLen + suffixLen;

    result = xmlMalloc(total + 1);
    if (result == NULL)
        return(NULL);

    memcpy(result, oldVal, prefixLen);
    memcpy(result + prefixLen, encoding, encLen);
    memcpy(result + prefixLen + encLen, oldVal + suffixPos, suffixLen);
    result[total] = 0;

    return(result);
}
/**
 * Look up an encoding declaration in the meta tags.
 *
 * The returned string points into attribute content and can contain
 * trailing garbage. It should be copied before modifying or freeing
 * nodes.
 *
 * @param doc the document
 * @returns the encoding or NULL if not found.
 */
const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc) {
    htmlNodePtr head = htmlFindHead(doc);
    htmlNodePtr cur;
    htmlMetaEncoding menc;

    if (head == NULL)
        return(NULL);

    cur = head->children;
    while (cur != NULL) {
        /*
         * A `const xmlChar *` return value can only expose a suffix
         * of the attribute value. For http-equiv meta tags, more data
         * may follow the charset, although that is probably rare in
         * practice.
         */
        if (htmlParseMetaEncoding(cur, &menc))
            return(menc.attrValue + menc.off.start);
        cur = cur->next;
    }

    return(NULL);
}
/**
 * Creates or updates a meta tag with an encoding declaration.
 *
 * Every existing encoding declaration among the children of the head
 * element is rewritten. If there is none, a new
 * `<meta charset="...">` element is inserted as the first child of
 * the head element.
 *
 * NOTE: This will not change the document content encoding.
 *
 * @param doc the document
 * @param encoding the encoding string
 * @returns 0 in case of success, 1 if no head element was found or
 * arguments are invalid and -1 if memory allocation failed.
 */
int
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
    htmlNodePtr head, cur, newMeta;
    int updated = 0;

    if (encoding == NULL)
        return(1);

    head = htmlFindHead(doc);
    if (head == NULL)
        return(1);

    for (cur = head->children; cur != NULL; cur = cur->next) {
        htmlMetaEncoding menc;
        xmlChar *value;
        int res;

        if (!htmlParseMetaEncoding(cur, &menc))
            continue;

        updated = 1;

        /*
         * Build the replacement before clearing the attribute:
         * menc.attrValue may point into the content that is about
         * to be released.
         */
        value = htmlUpdateMetaEncoding(&menc, (char *) encoding);
        if (value == NULL)
            return(-1);

        xmlNodeSetContent((xmlNodePtr) menc.attr, NULL);
        res = xmlNodeAddContent((xmlNodePtr) menc.attr, value);
        xmlFree(value);

        if (res < 0)
            return(-1);
    }

    if (updated)
        return(0);

    /* No declaration present: create one. */
    newMeta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL);
    if (newMeta == NULL)
        return(-1);

    if (xmlNewProp(newMeta, BAD_CAST "charset", encoding) == NULL) {
        xmlFreeNode(newMeta);
        return(-1);
    }

    /* Insert as the first child of head. */
    if (head->children != NULL)
        xmlAddPrevSibling(head->children, newMeta);
    else
        xmlAddChild(head, newMeta);

    return(0);
}
/**
* These are the HTML attributes which will be output
* in minimized form, i.e. `