1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-26 00:37:43 +03:00
Files
libxml2/testchar.c
Nick Wellnhofer 59b3366178 error: Limit number of parser errors
Reporting errors is expensive and some abusive test cases can generate
an error for each invalid input byte. This causes the parser to spend
most of the time with error handling. Limit the number of errors and
warnings to 100.
2022-12-27 14:41:19 +01:00

690 lines
17 KiB
C

/**
* Test the UTF-8 decoding routines
*
* author: Daniel Veillard
* copy: see Copyright for the status of this software.
*/
#include <stdio.h>
#include <string.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
/*
 * Code of the first error reported to errorHandler() since a test last
 * reset this variable to 0; 0 means that no error has been recorded.
 */
int lastError;
/**
 * errorHandler:
 * @unused:  user data handed to the callback; anything but NULL is ignored
 * @err:  the error being reported, may be NULL
 *
 * Structured error callback. Stores the code of the first error seen
 * since lastError was last cleared; later errors do not overwrite it.
 */
static void errorHandler(void *unused, xmlErrorPtr err) {
    if (unused != NULL)
        return;
    if (err == NULL)
        return;
    /* Keep the first error only. */
    if (lastError != 0)
        return;

    lastError = err->code;
}
/*
 * Document templates for the in-document tests. The "XXXX" run is a four
 * byte placeholder which testDocumentRanges() overwrites with the bytes
 * under test: element content in document1, an attribute value in
 * document2.
 */
char document1[100] = "<doc>XXXX</doc>";
char document2[100] = "<doc foo='XXXX'/>";
/**
 * testDocumentRangeByte1:
 * @ctxt:  parser context, reset before every probe
 * @document:  start of the document buffer to parse
 * @len:  number of bytes of @document to parse
 * @data:  position inside @document where the probe byte is written
 * @forbid1:  byte value that must be rejected at this position (callers
 *            pass -1 for none)
 * @forbid2:  second byte value that must be rejected (callers pass -1
 *            for none)
 *
 * Writes each of the 256 byte values at @data, parses the document and
 * checks that the outcome matches what is expected for that byte.
 * The parse itself is done with xmlReadMemory(); @ctxt is only reset.
 *
 * Returns 0 if all bytes behaved as expected, 1 on the first mismatch.
 */
static int testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
                                  int len, char *data, int forbid1,
                                  int forbid2) {
    int i;
    xmlDocPtr res;

    for (i = 0; i <= 0xFF; i++) {
        int failed = 0;

        lastError = 0;
        xmlCtxtReset(ctxt);

        data[0] = (char) i;

        res = xmlReadMemory(document, len, "test", NULL, 0);

        if ((i == forbid1) || (i == forbid2)) {
            /* Explicitly forbidden here: need an error and no document. */
            if ((lastError == 0) || (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X: %c\n",
                        i, i);
                failed = 1;
            }
        }
        else if ((i == '<') || (i == '&')) {
            /* Markup characters break the document at this position. */
            if ((lastError == 0) || (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect illegal char %c for Byte 0x%02X\n", i, i);
                failed = 1;
            }
        }
        else if (((i < 0x20) || (i >= 0x80)) &&
                 (i != 0x9) && (i != 0xA) && (i != 0xD)) {
            /*
             * Control bytes other than TAB, LF, CR and bytes with the
             * high bit set: only flagged when a document came back
             * without XML_ERR_INVALID_CHAR having been recorded.
             */
            if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X\n", i);
                failed = 1;
            }
        }
        else if (res == NULL) {
            fprintf(stderr,
                "Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
            failed = 1;
        }

        /* Release the tree on every path, the failing ones included. */
        if (res != NULL)
            xmlFreeDoc(res);
        if (failed)
            return(1);
    }
    return(0);
}
/**
 * testDocumentRangeByte2:
 * @ctxt:  parser context, reset before every probe
 * @document:  start of the document buffer to parse
 * @len:  number of bytes of @document to parse
 * @data:  position inside @document where the two probe bytes are written
 *
 * Writes every pair made of a first byte in 0x80..0xFF and a second byte
 * in 0x00..0xFF at @data, parses the document and checks that malformed
 * sequences are rejected and well-formed two byte sequences are accepted.
 * The parse itself is done with xmlReadMemory(); @ctxt is only reset.
 *
 * Returns 0 if all pairs behaved as expected, 1 on the first mismatch.
 */
static int testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
                                  int len, char *data) {
    int i, j;
    xmlDocPtr res;

    for (i = 0x80; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            int failed = 0;

            lastError = 0;
            xmlCtxtReset(ctxt);

            data[0] = (char) i;
            data[1] = (char) j;

            res = xmlReadMemory(document, len, "test", NULL, 0);

            /* if first bit of first char is set, then second bit must too */
            if ((i & 0x80) && ((i & 0x40) == 0)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if first bit of first char is set, then second char first
             * bits must be 10
             */
            else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if using a 2 byte encoding then the value must be at least
             * 0x80, i.e. one of bits 4 to 1 of i must be set
             */
            else if ((i & 0x80) && ((i & 0x1E) == 0)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if third bit of first char is set, then the sequence would
             * need at least 3 bytes, but we give only 2 !
             */
            else if ((i & 0xE0) == 0xE0) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * We should see no error in remaining cases
             */
            else if ((lastError != 0) || (res == NULL)) {
                fprintf(stderr,
                    "Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j);
                failed = 1;
            }

            /* Release the tree on every path, the failing ones included. */
            if (res != NULL)
                xmlFreeDoc(res);
            if (failed)
                return(1);
        }
    }
    return(0);
}
/**
 * testDocumentRanges:
 *
 * Checks UTF-8 handling when the bytes under test are injected into
 * complete XML documents. One and two byte probes are written at the
 * start and at the end of the four byte placeholder, once in element
 * content (document1) and once in an attribute value (document2).
 *
 * Returns 0 on success, otherwise the number of failed sub-tests, or 1
 * if the parser context could not be allocated.
 */
static int testDocumentRanges(void) {
    xmlParserCtxtPtr ctxt;
    char *data;
    int test_ret = 0;

    /* The context is shared by all the sub-tests below. */
    ctxt = xmlNewParserCtxt();
    if (ctxt == NULL) {
        fprintf(stderr, "Failed to allocate parser context\n");
        return(1);
    }

    printf("testing 1 byte char in document: 1");
    fflush(stdout);
    data = &document1[5];
    memset(data, ' ', 4);
    /* one byte, start of element content */
    test_ret += testDocumentRangeByte1(ctxt, document1, strlen(document1),
                                       data, -1, -1);

    printf(" 2");
    fflush(stdout);
    memset(data, ' ', 4);
    /* one byte, end of element content */
    test_ret += testDocumentRangeByte1(ctxt, document1, strlen(document1),
                                       data + 3, -1, -1);

    printf(" 3");
    fflush(stdout);
    data = &document2[10];
    memset(data, ' ', 4);
    /* one byte, start of attribute value; the quote ends the value */
    test_ret += testDocumentRangeByte1(ctxt, document2, strlen(document2),
                                       data, '\'', -1);

    printf(" 4");
    fflush(stdout);
    memset(data, ' ', 4);
    /* one byte, end of attribute value */
    test_ret += testDocumentRangeByte1(ctxt, document2, strlen(document2),
                                       data + 3, '\'', -1);
    printf(" done\n");

    printf("testing 2 byte char in document: 1");
    fflush(stdout);
    data = &document1[5];
    memset(data, ' ', 4);
    /* two bytes, start of element content */
    test_ret += testDocumentRangeByte2(ctxt, document1, strlen(document1),
                                       data);

    printf(" 2");
    fflush(stdout);
    memset(data, ' ', 4);
    /* two bytes, end of element content */
    test_ret += testDocumentRangeByte2(ctxt, document1, strlen(document1),
                                       data + 2);

    printf(" 3");
    fflush(stdout);
    data = &document2[10];
    memset(data, ' ', 4);
    /* two bytes, start of attribute value */
    test_ret += testDocumentRangeByte2(ctxt, document2, strlen(document2),
                                       data);

    printf(" 4");
    fflush(stdout);
    memset(data, ' ', 4);
    /* two bytes, end of attribute value */
    test_ret += testDocumentRangeByte2(ctxt, document2, strlen(document2),
                                       data + 2);
    printf(" done\n");

    xmlFreeParserCtxt(ctxt);
    return(test_ret);
}
/**
 * testCharRangeByte1:
 * @ctxt:  parser context whose current input holds the probe buffer
 *
 * Feeds every single byte value to xmlCurrentChar() and checks the
 * outcome: 0 and bytes >= 0x80 must raise XML_ERR_INVALID_CHAR, 0xD must
 * come back as 0xA, every other byte must be returned unchanged with a
 * length of 1.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte1(xmlParserCtxtPtr ctxt) {
    char *buf = (char *) ctxt->input->cur;
    int byte;
    int width, cp;

    /* Only the first byte varies, the following three stay at zero. */
    memset(buf + 1, 0, 3);

    for (byte = 0; byte <= 0xFF; byte++) {
        buf[0] = (char) byte;
        ctxt->charset = XML_CHAR_ENCODING_UTF8;
        ctxt->nbErrors = 0;

        lastError = 0;
        cp = xmlCurrentChar(ctxt, &width);

        if ((byte == 0) || (byte >= 0x80)) {
            /* an error is mandatory for these bytes */
            if (lastError != XML_ERR_INVALID_CHAR) {
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X\n", byte);
                return(1);
            }
            continue;
        }

        if (byte == 0xD) {
            if ((cp != 0xA) || (width != 1)) {
                fprintf(stderr, "Failed to convert char for Byte 0x%02X\n",
                        byte);
                return(1);
            }
            continue;
        }

        if ((cp != byte) || (width != 1)) {
            fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", byte);
            return(1);
        }
    }
    return(0);
}
/**
 * testCharRangeByte2:
 * @ctxt:  parser context whose current input holds the probe buffer
 *
 * Feeds every pair made of a lead byte in 0x80..0xFF and a second byte in
 * 0x00..0xFF to xmlCurrentChar(). Malformed pairs must raise
 * XML_ERR_INVALID_CHAR; well-formed ones must decode without error to the
 * expected code point with a length of 2.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte2(xmlParserCtxtPtr ctxt) {
    char *buf = (char *) ctxt->input->cur;
    int lead, trail;
    int width, cp;
    int expected;

    /* Bytes following the pair stay at zero. */
    buf[2] = 0;
    buf[3] = 0;

    for (lead = 0x80; lead <= 0xFF; lead++) {
        for (trail = 0; trail <= 0xFF; trail++) {
            buf[0] = (char) lead;
            buf[1] = (char) trail;
            ctxt->charset = XML_CHAR_ENCODING_UTF8;
            ctxt->nbErrors = 0;

            lastError = 0;
            cp = xmlCurrentChar(ctxt, &width);

            /* code point a well-formed two byte sequence decodes to */
            expected = (trail & 0x3F) + ((lead & 0x1F) << 6);

            if ((lead & 0x80) && ((lead & 0x40) == 0)) {
                /* a lead byte with the top bit set needs the next bit too */
                if (lastError != XML_ERR_INVALID_CHAR) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            lead, trail);
                    return(1);
                }
            }
            else if ((lead & 0x80) && ((trail & 0xC0) != 0x80)) {
                /* the second byte must start with the bits 10 */
                if (lastError != XML_ERR_INVALID_CHAR) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
                            lead, trail, cp);
                    return(1);
                }
            }
            else if ((lead & 0x80) && ((lead & 0x1E) == 0)) {
                /* the value would fit in one byte: overlong encoding */
                if (lastError != XML_ERR_INVALID_CHAR) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
                            lead, trail, cp);
                    return(1);
                }
            }
            else if ((lead & 0xE0) == 0xE0) {
                /* lead byte announces 3 bytes or more, only 2 are given */
                if (lastError != XML_ERR_INVALID_CHAR) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                            lead, trail);
                    return(1);
                }
            }
            else if ((lastError != 0) || (width != 2)) {
                /* everything left is valid and must parse cleanly */
                fprintf(stderr,
                    "Failed to parse char for Bytes 0x%02X 0x%02X\n",
                        lead, trail);
                return(1);
            }
            else if (cp != expected) {
                /* and must yield the right value */
                fprintf(stderr,
        "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
                        lead, trail, expected, cp);
                return(1);
            }
        }
    }
    return(0);
}
/**
 * testCharRangeByte3:
 * @ctxt:  parser context whose current input holds the probe buffer
 *
 * Feeds three byte sequences to xmlCurrentChar(): every first byte in
 * 0xE0..0xFF, every second byte in 0x00..0xFF and a sample of third
 * bytes taken from lows[]. Malformed sequences and code points excluded
 * by XML 1.0 must raise XML_ERR_INVALID_CHAR; the others must decode
 * without error to the expected value with a length of 3.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte3(xmlParserCtxtPtr ctxt) {
    int i, j, k, K;
    int len, c;
    /* third byte samples: both valid and invalid continuation bytes */
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    char *data = (char *) ctxt->input->cur;
    int value;

    data[3] = 0;
    for (i = 0xE0; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            for (k = 0; k < 6; k++) {
                data[0] = (char) i;
                data[1] = (char) j;
                K = lows[k];
                data[2] = (char) K;
                /* code point the three bytes would decode to */
                value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
                ctxt->charset = XML_CHAR_ENCODING_UTF8;
                ctxt->nbErrors = 0;

                lastError = 0;
                c = xmlCurrentChar(ctxt, &len);

                /*
                 * if fourth bit of first char is set, then the sequence
                 * would need at least 4 bytes, but we give only 3 !
                 */
                if ((i & 0xF0) == 0xF0) {
                    if (lastError != XML_ERR_INVALID_CHAR) {
                        /* cast: a plain char may be signed */
                        fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, (unsigned char) data[3]);
                        return(1);
                    }
                }

                /*
                 * The second and the third bytes must start with 10
                 */
                else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
                    if (lastError != XML_ERR_INVALID_CHAR) {
                        fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                        return(1);
                    }
                }

                /*
                 * if using a 3 byte encoding then the value must be at
                 * least 0x800, i.e. one of bits 3 to 0 of i must be set
                 * or bit 5 of data[1] must be set
                 */
                else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
                    if (lastError != XML_ERR_INVALID_CHAR) {
                        fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                        return(1);
                    }
                }

                /*
                 * There are values in that range that are not allowed in
                 * XML-1.0
                 */
                else if (((value > 0xD7FF) && (value < 0xE000)) ||
                         ((value > 0xFFFD) && (value < 0x10000))) {
                    if (lastError != XML_ERR_INVALID_CHAR) {
                        fprintf(stderr,
        "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
                                value, i, j, K);
                        return(1);
                    }
                }

                /*
                 * We should see no error in remaining cases
                 */
                else if ((lastError != 0) || (len != 3)) {
                    fprintf(stderr,
                        "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
                            i, j, K);
                    return(1);
                }

                /*
                 * Finally check the value is right
                 */
                else if (c != value) {
                    /* print K, not data[2]: a signed char sign-extends */
                    fprintf(stderr,
        "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                            i, j, K, value, c);
                    return(1);
                }
            }
        }
    }
    return(0);
}
/**
 * testCharRangeByte4:
 * @ctxt:  parser context whose current input holds the probe buffer
 *
 * Feeds four byte sequences to xmlCurrentChar(): every first byte in
 * 0xF0..0xFF, every second byte in 0x00..0xFF and samples of third and
 * fourth bytes taken from lows[]. Malformed sequences and code points
 * excluded by XML 1.0 (including anything above 0x10FFFF) must raise
 * XML_ERR_INVALID_CHAR; the others must decode without error to the
 * expected value with a length of 4.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte4(xmlParserCtxtPtr ctxt) {
    int i, j, k, K, l, L;
    int len, c;
    /* third/fourth byte samples: valid and invalid continuation bytes */
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    char *data = (char *) ctxt->input->cur;
    int value;

    data[4] = 0;
    for (i = 0xF0; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            for (k = 0; k < 6; k++) {
                for (l = 0; l < 6; l++) {
                    data[0] = (char) i;
                    data[1] = (char) j;
                    K = lows[k];
                    data[2] = (char) K;
                    L = lows[l];
                    data[3] = (char) L;
                    /* code point the four bytes would decode to */
                    value = (L & 0x3F) + ((K & 0x3F) << 6) +
                            ((j & 0x3F) << 12) + ((i & 0x7) << 18);
                    ctxt->charset = XML_CHAR_ENCODING_UTF8;
                    ctxt->nbErrors = 0;

                    lastError = 0;
                    c = xmlCurrentChar(ctxt, &len);

                    /*
                     * if fifth bit of first char is set, then the sequence
                     * would need at least 5 bytes, but we give only 4 !
                     */
                    if ((i & 0xF8) == 0xF8) {
                        if (lastError != XML_ERR_INVALID_CHAR) {
                            /* print L, not data[3]: char may be signed */
                            fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * The second, third and fourth bytes must start with 10
                     */
                    else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
                             ((L & 0xC0) != 0x80)) {
                        if (lastError != XML_ERR_INVALID_CHAR) {
                            fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * if using a 4 byte encoding then the value must be at
                     * least 0x10000, i.e. one of bits 2 to 0 of i must be
                     * set or bit 5 or bit 4 of j must be set
                     */
                    else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
                        if (lastError != XML_ERR_INVALID_CHAR) {
                            fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * There are values in that range that are not allowed
                     * in XML-1.0
                     */
                    else if (((value > 0xD7FF) && (value < 0xE000)) ||
                             ((value > 0xFFFD) && (value < 0x10000)) ||
                             (value > 0x10FFFF)) {
                        if (lastError != XML_ERR_INVALID_CHAR) {
                            fprintf(stderr,
    "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    value, i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * We should see no error in remaining cases
                     */
                    else if ((lastError != 0) || (len != 4)) {
                        /* report all four bytes of the sequence */
                        fprintf(stderr,
                "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, L);
                        return(1);
                    }

                    /*
                     * Finally check the value is right
                     */
                    else if (c != value) {
                        fprintf(stderr,
    "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                                i, j, K, L, value, c);
                        return(1);
                    }
                }
            }
        }
    }
    return(0);
}
/**
 * testCharRanges:
 *
 * Test the correct UTF8 character parsing in isolation i.e.
 * not when parsing a full document, this is less expensive and we can
 * cover the full range of UTF-8 chars accepted by XML-1.0
 *
 * Returns 0 on success, otherwise the number of failed sub-tests, or 1
 * if the parsing context or its input could not be set up.
 */
static int testCharRanges(void) {
    char data[5];
    xmlParserCtxtPtr ctxt;
    xmlParserInputBufferPtr buf;
    xmlParserInputPtr input;
    int test_ret = 0;

    memset(data, 0, sizeof(data));

    /*
     * Set up a parsing context whose current input is built from the
     * above data buffer.
     */
    ctxt = xmlNewParserCtxt();
    if (ctxt == NULL) {
        fprintf(stderr, "Failed to allocate parser context\n");
        return(1);
    }
    buf = xmlParserInputBufferCreateMem(data, sizeof(data),
                                        XML_CHAR_ENCODING_NONE);
    if (buf == NULL) {
        fprintf(stderr, "Failed to allocate input buffer\n");
        test_ret = 1;
        goto error;
    }
    input = xmlNewInputStream(ctxt);
    if (input == NULL) {
        fprintf(stderr, "Failed to allocate input stream\n");
        xmlFreeParserInputBuffer(buf);
        test_ret = 1;
        goto error;
    }
    input->filename = NULL;
    input->buf = buf;
    input->cur =
    input->base = xmlBufContent(input->buf->buffer);
    /* the sub-tests probe sequences of at most four bytes */
    input->end = input->base + 4;
    if (inputPush(ctxt, input) < 0) {
        /*
         * Without a current input the sub-tests below would dereference
         * a NULL ctxt->input, so give up here.
         * NOTE(review): whether inputPush() releases the input on
         * failure is not visible from this file - confirm before adding
         * an explicit free here.
         */
        fprintf(stderr, "Failed to push input stream\n");
        test_ret = 1;
        goto error;
    }

    printf("testing char range: 1");
    fflush(stdout);
    test_ret += testCharRangeByte1(ctxt);
    printf(" 2");
    fflush(stdout);
    test_ret += testCharRangeByte2(ctxt);
    printf(" 3");
    fflush(stdout);
    test_ret += testCharRangeByte3(ctxt);
    printf(" 4");
    fflush(stdout);
    test_ret += testCharRangeByte4(ctxt);
    printf(" done\n");
    fflush(stdout);

error:
    xmlFreeParserCtxt(ctxt);
    return(test_ret);
}
/**
 * main:
 *
 * Runs the isolated character tests, then the in-document tests.
 *
 * Returns 0 if every test passed, 1 otherwise.
 */
int main(void) {
    int failures;

    /*
     * Initializes the library and checks for potential ABI mismatches
     * between the version this was compiled for and the shared library
     * actually in use.
     */
    LIBXML_TEST_VERSION

    /* Route errors to our own handler so the tests can inspect them. */
    xmlSetStructuredErrorFunc(NULL, errorHandler);

    failures = testCharRanges();
    failures += testDocumentRanges();

    /* Library cleanup. */
    xmlCleanupParser();

    /* Memory debugging for the regression tests. */
    xmlMemoryDump();

    return(failures != 0);
}