mirror of
				https://gitlab.gnome.org/GNOME/libxml2.git
				synced 2025-10-26 00:37:43 +03:00 
			
		
		
		
	Add options to ignore the internal encoding
For both XML and HTML, the document can provide an encoding, either in the XMLDecl in XML or as a meta element in the HTML head. This adds options to ignore those encodings if the encoding is known in advance, for example if the content had been converted before being passed to the parser. * parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option for XML parsing * include/libxml/HTMLparser.h HTMLparser.c: add the HTML_PARSE_IGNORE_ENC option for HTML parsing * HTMLtree.c: fix the handling of saving when an unknown encoding is defined in the meta document header * xmllint.c: add a --noenc option to activate the new parser options
This commit is contained in:
		
							
								
								
									
										11
									
								
								HTMLparser.c
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								HTMLparser.c
									
									
									
									
									
								
							| @@ -3448,7 +3448,8 @@ static void | |||||||
| htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { | htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { | ||||||
|     const xmlChar *encoding; |     const xmlChar *encoding; | ||||||
|  |  | ||||||
|     if ((ctxt == NULL) || (attvalue == NULL)) |     if ((ctxt == NULL) || (attvalue == NULL) || | ||||||
|  |         (ctxt->options & HTML_PARSE_IGNORE_ENC)) | ||||||
| 	return; | 	return; | ||||||
|  |  | ||||||
|     /* do not change encoding */ |     /* do not change encoding */ | ||||||
| @@ -3500,7 +3501,9 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { | |||||||
| 		xmlSwitchToEncoding(ctxt, handler); | 		xmlSwitchToEncoding(ctxt, handler); | ||||||
| 		ctxt->charset = XML_CHAR_ENCODING_UTF8; | 		ctxt->charset = XML_CHAR_ENCODING_UTF8; | ||||||
| 	    } else { | 	    } else { | ||||||
| 		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; | 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, | ||||||
|  | 		             "htmlCheckEncoding: unknown encoding %s\n", | ||||||
|  | 			     encoding, NULL); | ||||||
| 	    } | 	    } | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| @@ -6537,6 +6540,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) | |||||||
| 	ctxt->options |= HTML_PARSE_NODEFDTD; | 	ctxt->options |= HTML_PARSE_NODEFDTD; | ||||||
|         options -= HTML_PARSE_NODEFDTD; |         options -= HTML_PARSE_NODEFDTD; | ||||||
|     } |     } | ||||||
|  |     if (options & HTML_PARSE_IGNORE_ENC) { | ||||||
|  | 	ctxt->options |= HTML_PARSE_IGNORE_ENC; | ||||||
|  |         options -= HTML_PARSE_IGNORE_ENC; | ||||||
|  |     } | ||||||
|     ctxt->dictNames = 0; |     ctxt->dictNames = 0; | ||||||
|     return (options); |     return (options); | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										16
									
								
								HTMLtree.c
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								HTMLtree.c
									
									
									
									
									
								
							| @@ -481,7 +481,7 @@ htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, | |||||||
| 	if (enc != XML_CHAR_ENCODING_UTF8) { | 	if (enc != XML_CHAR_ENCODING_UTF8) { | ||||||
| 	    handler = xmlFindCharEncodingHandler(encoding); | 	    handler = xmlFindCharEncodingHandler(encoding); | ||||||
| 	    if (handler == NULL) | 	    if (handler == NULL) | ||||||
| 		return(-1); | 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); | ||||||
| 	} | 	} | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -562,11 +562,9 @@ htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { | |||||||
| 	    } | 	    } | ||||||
|  |  | ||||||
| 	    handler = xmlFindCharEncodingHandler(encoding); | 	    handler = xmlFindCharEncodingHandler(encoding); | ||||||
| 	    if (handler == NULL) { | 	    if (handler == NULL) | ||||||
| 		*mem = NULL; |                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); | ||||||
| 		*size = 0; |  | ||||||
| 		return; |  | ||||||
| 	    } |  | ||||||
| 	} else { | 	} else { | ||||||
| 	    handler = xmlFindCharEncodingHandler(encoding); | 	    handler = xmlFindCharEncodingHandler(encoding); | ||||||
| 	} | 	} | ||||||
| @@ -1061,7 +1059,7 @@ htmlDocDump(FILE *f, xmlDocPtr cur) { | |||||||
|  |  | ||||||
| 	    handler = xmlFindCharEncodingHandler(encoding); | 	    handler = xmlFindCharEncodingHandler(encoding); | ||||||
| 	    if (handler == NULL) | 	    if (handler == NULL) | ||||||
| 		return(-1); | 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); | ||||||
| 	} else { | 	} else { | ||||||
| 	    handler = xmlFindCharEncodingHandler(encoding); | 	    handler = xmlFindCharEncodingHandler(encoding); | ||||||
| 	} | 	} | ||||||
| @@ -1120,7 +1118,7 @@ htmlSaveFile(const char *filename, xmlDocPtr cur) { | |||||||
|  |  | ||||||
| 	    handler = xmlFindCharEncodingHandler(encoding); | 	    handler = xmlFindCharEncodingHandler(encoding); | ||||||
| 	    if (handler == NULL) | 	    if (handler == NULL) | ||||||
| 		return(-1); | 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); | ||||||
| 	} | 	} | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -1181,7 +1179,7 @@ htmlSaveFileFormat(const char *filename, xmlDocPtr cur, | |||||||
|  |  | ||||||
| 	    handler = xmlFindCharEncodingHandler(encoding); | 	    handler = xmlFindCharEncodingHandler(encoding); | ||||||
| 	    if (handler == NULL) | 	    if (handler == NULL) | ||||||
| 		return(-1); | 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); | ||||||
| 	} | 	} | ||||||
|         htmlSetMetaEncoding(cur, (const xmlChar *) encoding); |         htmlSetMetaEncoding(cur, (const xmlChar *) encoding); | ||||||
|     } else { |     } else { | ||||||
|   | |||||||
| @@ -184,7 +184,8 @@ typedef enum { | |||||||
|     HTML_PARSE_NOBLANKS	= 1<<8,	/* remove blank nodes */ |     HTML_PARSE_NOBLANKS	= 1<<8,	/* remove blank nodes */ | ||||||
|     HTML_PARSE_NONET	= 1<<11,/* Forbid network access */ |     HTML_PARSE_NONET	= 1<<11,/* Forbid network access */ | ||||||
|     HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */ |     HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */ | ||||||
|     HTML_PARSE_COMPACT  = 1<<16 /* compact small text nodes */ |     HTML_PARSE_COMPACT  = 1<<16,/* compact small text nodes */ | ||||||
|  |     HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */ | ||||||
| } htmlParserOption; | } htmlParserOption; | ||||||
|  |  | ||||||
| XMLPUBFUN void XMLCALL | XMLPUBFUN void XMLCALL | ||||||
|   | |||||||
| @@ -1105,8 +1105,9 @@ typedef enum { | |||||||
| 				   crash if you try to modify the tree) */ | 				   crash if you try to modify the tree) */ | ||||||
|     XML_PARSE_OLD10	= 1<<17,/* parse using XML-1.0 before update 5 */ |     XML_PARSE_OLD10	= 1<<17,/* parse using XML-1.0 before update 5 */ | ||||||
|     XML_PARSE_NOBASEFIX = 1<<18,/* do not fixup XINCLUDE xml:base uris */ |     XML_PARSE_NOBASEFIX = 1<<18,/* do not fixup XINCLUDE xml:base uris */ | ||||||
|     XML_PARSE_HUGE      = 1<<19, /* relax any hardcoded limit from the parser */ |     XML_PARSE_HUGE      = 1<<19,/* relax any hardcoded limit from the parser */ | ||||||
|     XML_PARSE_OLDSAX    = 1<<20 /* parse using SAX2 interface from before 2.7.0 */ |     XML_PARSE_OLDSAX    = 1<<20,/* parse using SAX2 interface before 2.7.0 */ | ||||||
|  |     XML_PARSE_IGNORE_ENC= 1<<21 /* ignore internal document encoding hint */ | ||||||
| } xmlParserOption; | } xmlParserOption; | ||||||
|  |  | ||||||
| XMLPUBFUN void XMLCALL | XMLPUBFUN void XMLCALL | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								parser.c
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								parser.c
									
									
									
									
									
								
							| @@ -9922,6 +9922,13 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) { | |||||||
| 	} else { | 	} else { | ||||||
| 	    xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL); | 	    xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|  |         /* | ||||||
|  |          * Non standard parsing, allowing the user to ignore encoding | ||||||
|  |          */ | ||||||
|  |         if (ctxt->options & XML_PARSE_IGNORE_ENC) | ||||||
|  |             return(encoding); | ||||||
|  |  | ||||||
| 	/* | 	/* | ||||||
| 	 * UTF-16 encoding stwich has already taken place at this stage, | 	 * UTF-16 encoding stwich has already taken place at this stage, | ||||||
| 	 * more over the little-endian/big-endian selection is already done | 	 * more over the little-endian/big-endian selection is already done | ||||||
| @@ -14561,6 +14568,10 @@ xmlCtxtUseOptionsInternal(xmlParserCtxtPtr ctxt, int options, const char *encodi | |||||||
| 	ctxt->options |= XML_PARSE_OLDSAX; | 	ctxt->options |= XML_PARSE_OLDSAX; | ||||||
|         options -= XML_PARSE_OLDSAX; |         options -= XML_PARSE_OLDSAX; | ||||||
|     } |     } | ||||||
|  |     if (options & XML_PARSE_IGNORE_ENC) { | ||||||
|  | 	ctxt->options |= XML_PARSE_IGNORE_ENC; | ||||||
|  |         options -= XML_PARSE_IGNORE_ENC; | ||||||
|  |     } | ||||||
|     ctxt->linenumbers = 1; |     ctxt->linenumbers = 1; | ||||||
|     return (options); |     return (options); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -130,6 +130,7 @@ static int copy = 0; | |||||||
| #endif /* LIBXML_TREE_ENABLED */ | #endif /* LIBXML_TREE_ENABLED */ | ||||||
| static int recovery = 0; | static int recovery = 0; | ||||||
| static int noent = 0; | static int noent = 0; | ||||||
|  | static int noenc = 0; | ||||||
| static int noblanks = 0; | static int noblanks = 0; | ||||||
| static int noout = 0; | static int noout = 0; | ||||||
| static int nowrap = 0; | static int nowrap = 0; | ||||||
| @@ -2983,6 +2984,7 @@ static void usage(const char *name) { | |||||||
|     printf("\t--recover : output what was parsable on broken XML documents\n"); |     printf("\t--recover : output what was parsable on broken XML documents\n"); | ||||||
|     printf("\t--huge : remove any internal arbitrary parser limits\n"); |     printf("\t--huge : remove any internal arbitrary parser limits\n"); | ||||||
|     printf("\t--noent : substitute entity references by their value\n"); |     printf("\t--noent : substitute entity references by their value\n"); | ||||||
|  |     printf("\t--noenc : ignore any encoding specified inside the document\n"); | ||||||
|     printf("\t--noout : don't output the result tree\n"); |     printf("\t--noout : don't output the result tree\n"); | ||||||
|     printf("\t--path 'paths': provide a set of paths for resources\n"); |     printf("\t--path 'paths': provide a set of paths for resources\n"); | ||||||
|     printf("\t--load-trace : print trace of all external entites loaded\n"); |     printf("\t--load-trace : print trace of all external entites loaded\n"); | ||||||
| @@ -3137,6 +3139,10 @@ main(int argc, char **argv) { | |||||||
| 	         (!strcmp(argv[i], "--noent"))) { | 	         (!strcmp(argv[i], "--noent"))) { | ||||||
| 	    noent++; | 	    noent++; | ||||||
| 	    options |= XML_PARSE_NOENT; | 	    options |= XML_PARSE_NOENT; | ||||||
|  | 	} else if ((!strcmp(argv[i], "-noenc")) || | ||||||
|  | 	         (!strcmp(argv[i], "--noenc"))) { | ||||||
|  | 	    noenc++; | ||||||
|  | 	    options |= XML_PARSE_IGNORE_ENC; | ||||||
| 	} else if ((!strcmp(argv[i], "-nsclean")) || | 	} else if ((!strcmp(argv[i], "-nsclean")) || | ||||||
| 	         (!strcmp(argv[i], "--nsclean"))) { | 	         (!strcmp(argv[i], "--nsclean"))) { | ||||||
| 	    options |= XML_PARSE_NSCLEAN; | 	    options |= XML_PARSE_NSCLEAN; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user