1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-14 20:01:04 +03:00

Mostly HTML generation and parsing enhancements:

- HTMLparser.[ch] testHTML.c: applied the second set of
  patches from Wayne Davison <wayned@blorf.net>, adding
  htmlEncodeEntities()
- HTMLparser.c: fixed an ignorable white space detection bug
  occurring when parsing with SAX only
- result/HTML/*.sax: updated since the output is now HTML
  encoded...
Daniel.
This commit is contained in:
Daniel Veillard
2000-08-28 10:04:51 +00:00
parent 47f3f31f83
commit e010c17d78
18 changed files with 571 additions and 580 deletions

View File

@ -1341,7 +1341,7 @@ UTF8ToHtml(unsigned char* out, int *outlen,
/* assertion: c is a single UTF-4 value */
if (c < 0x80) {
if (out >= outend)
if (out + 1 >= outend)
break;
*out++ = c;
} else {
@ -1360,7 +1360,7 @@ UTF8ToHtml(unsigned char* out, int *outlen,
return(-2);
}
len = strlen(ent->name);
if (out + 2 + len > outend)
if (out + 2 + len >= outend)
break;
*out++ = '&';
memcpy(out, ent->name, len);
@ -1374,6 +1374,99 @@ UTF8ToHtml(unsigned char* out, int *outlen,
return(0);
}
/**
 * htmlEncodeEntities:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 * @quoteChar: the quote character to escape (' or ") or zero.
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * plus HTML entities block of chars out.
 *
 * ASCII characters other than '&', '<', '>' and @quoteChar are copied
 * as-is. Every other character is written as "&name;" when
 * htmlEntityValueLookup() knows an entity for it, or as a decimal
 * numeric reference "&#NNN;" otherwise.
 *
 * Conversion stops early, still reporting success, when the next
 * character does not fit in @out or when @in ends in the middle of
 * a multi-byte sequence; only complete characters are consumed.
 *
 * Returns 0 if success, or -2 if @in is not valid UTF-8 (stray
 *     continuation byte, lead byte >= 0xF8 or missing continuation byte).
 * The value of @inlen after return is the number of octets consumed
 *     (whole characters that were fully converted).
 * The value of @outlen after return is the number of octets produced.
 */
int
htmlEncodeEntities(unsigned char* out, int *outlen,
                   const unsigned char* in, int *inlen, int quoteChar) {
    const unsigned char* processed = in;
    const unsigned char* outend = out + (*outlen);
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend = in + (*inlen);
    unsigned int c, d;
    int trailing;
    int ret = 0;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) {
            c = d;
            trailing = 0;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            goto done;
        } else if (d < 0xE0) {
            c = d & 0x1F;
            trailing = 1;
        } else if (d < 0xF0) {
            c = d & 0x0F;
            trailing = 2;
        } else if (d < 0xF8) {
            c = d & 0x07;
            trailing = 3;
        } else {
            /* no chance for this in Ascii */
            ret = -2;
            goto done;
        }

        /* incomplete sequence at end of input: leave it unconsumed */
        if (inend - in < trailing)
            break;

        while (trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                ret = -2;
                goto done;
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
            (c != '&') && (c != '<') && (c != '>')) {
            if (out >= outend)
                break;
            *out++ = (unsigned char) c;
        } else {
            htmlEntityDescPtr ent;
            const char *cp;
            char nbuf[16];
            int len;

            /*
             * Try to lookup a predefined HTML entity for it
             */
            ent = htmlEntityValueLookup(c);
            if (ent == NULL) {
                /* bounded write: never trust the buffer size implicitly */
                snprintf(nbuf, sizeof(nbuf), "#%u", c);
                cp = nbuf;
            } else {
                cp = ent->name;
            }
            len = (int) strlen(cp);

            /*
             * Need room for '&' + name + ';'. Compare the remaining
             * space instead of computing out + 2 + len, which could
             * point past the end of the buffer (undefined behaviour).
             */
            if (outend - out < 2 + len)
                break;
            *out++ = '&';
            memcpy(out, cp, len);
            out += len;
            *out++ = ';';
        }
        /* the character was fully emitted: commit the input position */
        processed = in;
    }

done:
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(ret);
}
/**
* htmlDecodeEntities:
@ -1555,6 +1648,12 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
if (CUR == 0) return(1);
if (CUR != '<') return(0);
if (ctxt->name == NULL)
return(1);
if (!xmlStrcmp(ctxt->name, BAD_CAST"head"))
return(1);
if (!xmlStrcmp(ctxt->name, BAD_CAST"body"))
return(1);
if (ctxt->node == NULL) return(0);
lastChild = xmlGetLastChild(ctxt->node);
if (lastChild == NULL) {