mirror of
				https://gitlab.gnome.org/GNOME/libxml2.git
				synced 2025-10-24 13:33:01 +03:00 
			
		
		
		
	html: Map some encodings according to HTML5
Windows-1252 is a superset of ISO-8859-1 and should be used instead. The same applies to ASCII. Also map UCS-2 and UTF-16 to UTF-16LE.
This commit is contained in:
		| @@ -2745,7 +2745,8 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask, | |||||||
|                     guess = htmlFindEncoding(ctxt); |                     guess = htmlFindEncoding(ctxt); | ||||||
| #endif | #endif | ||||||
|                     if (guess == NULL) { |                     if (guess == NULL) { | ||||||
|                         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); |                         xmlSwitchEncoding(ctxt, | ||||||
|  |                                 XML_CHAR_ENCODING_WINDOWS_1252); | ||||||
|                     } else { |                     } else { | ||||||
|                         xmlSwitchEncodingName(ctxt, (const char *) guess); |                         xmlSwitchEncodingName(ctxt, (const char *) guess); | ||||||
|                         xmlFree(guess); |                         xmlFree(guess); | ||||||
| @@ -3288,7 +3289,8 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) { | |||||||
|                     guess = htmlFindEncoding(ctxt); |                     guess = htmlFindEncoding(ctxt); | ||||||
| #endif | #endif | ||||||
|                     if (guess == NULL) { |                     if (guess == NULL) { | ||||||
|                         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); |                         xmlSwitchEncoding(ctxt, | ||||||
|  |                                 XML_CHAR_ENCODING_WINDOWS_1252); | ||||||
|                     } else { |                     } else { | ||||||
|                         xmlSwitchEncodingName(ctxt, (const char *) guess); |                         xmlSwitchEncodingName(ctxt, (const char *) guess); | ||||||
|                         xmlFree(guess); |                         xmlFree(guess); | ||||||
|   | |||||||
							
								
								
									
										20
									
								
								encoding.c
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								encoding.c
									
									
									
									
									
								
							| @@ -1087,6 +1087,26 @@ xmlCreateCharEncodingHandler(const char *name, xmlCharEncFlags flags, | |||||||
|         return(XML_ERR_OK); |         return(XML_ERR_OK); | ||||||
|  |  | ||||||
|     if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) { |     if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) { | ||||||
|  |         if (flags & XML_ENC_HTML) { | ||||||
|  |             /* | ||||||
|  |              * TODO: HTML5 only allows a fixed set of charset | ||||||
|  |              * labels. We should add an option to enable or | ||||||
|  |              * disable this restriction. | ||||||
|  |              * | ||||||
|  |              * TODO: Map ISO-8859-9 to windows-1254. | ||||||
|  |              */ | ||||||
|  |             switch (enc) { | ||||||
|  |                 case XML_CHAR_ENCODING_ASCII: | ||||||
|  |                 case XML_CHAR_ENCODING_8859_1: | ||||||
|  |                     enc = XML_CHAR_ENCODING_WINDOWS_1252; | ||||||
|  |                     break; | ||||||
|  |                 case XML_CHAR_ENCODING_UCS2: | ||||||
|  |                 case XML_CHAR_ENCODING_UTF16: | ||||||
|  |                     enc = XML_CHAR_ENCODING_UTF16LE; | ||||||
|  |                     break; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|         handler = &defaultHandlers[enc]; |         handler = &defaultHandlers[enc]; | ||||||
|         if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) && |         if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) && | ||||||
|             (((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) { |             (((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) { | ||||||
|   | |||||||
| @@ -119,7 +119,9 @@ typedef enum { | |||||||
|     /** Create converter for input (conversion to UTF-8) */ |     /** Create converter for input (conversion to UTF-8) */ | ||||||
|     XML_ENC_INPUT = (1 << 0), |     XML_ENC_INPUT = (1 << 0), | ||||||
|     /** Create converter for output (conversion from UTF-8) */ |     /** Create converter for output (conversion from UTF-8) */ | ||||||
|     XML_ENC_OUTPUT = (1 << 1) |     XML_ENC_OUTPUT = (1 << 1), | ||||||
|  |     /** Use HTML5 mappings */ | ||||||
|  |     XML_ENC_HTML = (1 << 2) | ||||||
| } xmlCharEncFlags; | } xmlCharEncFlags; | ||||||
|  |  | ||||||
| /** | /** | ||||||
|   | |||||||
| @@ -1229,11 +1229,14 @@ xmlSwitchInputEncodingName(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, | |||||||
|                            const char *encoding) { |                            const char *encoding) { | ||||||
|     xmlCharEncodingHandlerPtr handler; |     xmlCharEncodingHandlerPtr handler; | ||||||
|     xmlParserErrors res; |     xmlParserErrors res; | ||||||
|  |     xmlCharEncFlags flags = XML_ENC_INPUT; | ||||||
|  |  | ||||||
|     if (encoding == NULL) |     if (encoding == NULL) | ||||||
|         return(-1); |         return(-1); | ||||||
|  |  | ||||||
|     res = xmlCreateCharEncodingHandler(encoding, XML_ENC_INPUT, |     if (ctxt->html) | ||||||
|  |         flags |= XML_ENC_HTML; | ||||||
|  |     res = xmlCreateCharEncodingHandler(encoding, flags, | ||||||
|             ctxt->convImpl, ctxt->convCtxt, &handler); |             ctxt->convImpl, ctxt->convCtxt, &handler); | ||||||
|     if (res == XML_ERR_UNSUPPORTED_ENCODING) { |     if (res == XML_ERR_UNSUPPORTED_ENCODING) { | ||||||
|         xmlWarningMsg(ctxt, XML_ERR_UNSUPPORTED_ENCODING, |         xmlWarningMsg(ctxt, XML_ERR_UNSUPPORTED_ENCODING, | ||||||
| @@ -1569,14 +1572,17 @@ xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) { | |||||||
|         ((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) { |         ((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) { | ||||||
|         xmlCharEncodingHandlerPtr handler; |         xmlCharEncodingHandlerPtr handler; | ||||||
|         xmlParserErrors res; |         xmlParserErrors res; | ||||||
|  |         xmlCharEncFlags flags = XML_ENC_INPUT; | ||||||
|  |  | ||||||
|         /* |         /* | ||||||
|          * xmlSwitchEncodingName treats unsupported encodings as |          * xmlSwitchEncodingName treats unsupported encodings as | ||||||
|          * warnings, but we want it to be an error in an encoding |          * warnings, but we want it to be an error in an encoding | ||||||
|          * declaration. |          * declaration. | ||||||
|          */ |          */ | ||||||
|  |         if (ctxt->html) | ||||||
|  |             flags |= XML_ENC_HTML; | ||||||
|         res = xmlCreateCharEncodingHandler((const char *) encoding, |         res = xmlCreateCharEncodingHandler((const char *) encoding, | ||||||
|                 XML_ENC_INPUT, ctxt->convImpl, ctxt->convCtxt, &handler); |                 flags, ctxt->convImpl, ctxt->convCtxt, &handler); | ||||||
|         if (res != XML_ERR_OK) { |         if (res != XML_ERR_OK) { | ||||||
|             xmlFatalErr(ctxt, res, (const char *) encoding); |             xmlFatalErr(ctxt, res, (const char *) encoding); | ||||||
|             xmlFree(encoding); |             xmlFree(encoding); | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| SAX.setDocumentLocator() | SAX.setDocumentLocator() | ||||||
| SAX.startDocument() | SAX.startDocument() | ||||||
| SAX.comment(?a) | SAX.comment(?a“) | ||||||
| SAX.endDocument() | SAX.endDocument() | ||||||
|   | |||||||
| @@ -1,5 +1,5 @@ | |||||||
| SAX.setDocumentLocator() | SAX.setDocumentLocator() | ||||||
| SAX.startDocument() | SAX.startDocument() | ||||||
| SAX.comment(<!dOctYPE | SAX.comment(‘<!dOctYPE | ||||||
| ) | ) | ||||||
| SAX.endDocument() | SAX.endDocument() | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user