mirror of
				https://gitlab.gnome.org/GNOME/libxml2.git
				synced 2025-10-24 13:33:01 +03:00 
			
		
		
		
	html: Map some encodings according to HTML5
Windows-1252 is a superset of ISO-8859-1 and should be used in its place when parsing HTML. The same applies to ASCII. UCS-2 and UTF-16 are also mapped to UTF-16LE.
This commit is contained in:
		| @@ -2745,7 +2745,8 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask, | ||||
|                     guess = htmlFindEncoding(ctxt); | ||||
| #endif | ||||
|                     if (guess == NULL) { | ||||
|                         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); | ||||
|                         xmlSwitchEncoding(ctxt, | ||||
|                                 XML_CHAR_ENCODING_WINDOWS_1252); | ||||
|                     } else { | ||||
|                         xmlSwitchEncodingName(ctxt, (const char *) guess); | ||||
|                         xmlFree(guess); | ||||
| @@ -3288,7 +3289,8 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) { | ||||
|                     guess = htmlFindEncoding(ctxt); | ||||
| #endif | ||||
|                     if (guess == NULL) { | ||||
|                         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); | ||||
|                         xmlSwitchEncoding(ctxt, | ||||
|                                 XML_CHAR_ENCODING_WINDOWS_1252); | ||||
|                     } else { | ||||
|                         xmlSwitchEncodingName(ctxt, (const char *) guess); | ||||
|                         xmlFree(guess); | ||||
|   | ||||
							
								
								
									
										20
									
								
								encoding.c
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								encoding.c
									
									
									
									
									
								
							| @@ -1087,6 +1087,26 @@ xmlCreateCharEncodingHandler(const char *name, xmlCharEncFlags flags, | ||||
|         return(XML_ERR_OK); | ||||
|  | ||||
|     if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) { | ||||
|         if (flags & XML_ENC_HTML) { | ||||
|             /* | ||||
|              * TODO: HTML5 only allows a fixed set of charset | ||||
|              * labels. We should add an option to enable or | ||||
|              * disable this restriction. | ||||
|              * | ||||
|              * TODO: Map ISO-8859-9 to windows-1254. | ||||
|              */ | ||||
|             switch (enc) { | ||||
|                 case XML_CHAR_ENCODING_ASCII: | ||||
|                 case XML_CHAR_ENCODING_8859_1: | ||||
|                     enc = XML_CHAR_ENCODING_WINDOWS_1252; | ||||
|                     break; | ||||
|                 case XML_CHAR_ENCODING_UCS2: | ||||
|                 case XML_CHAR_ENCODING_UTF16: | ||||
|                     enc = XML_CHAR_ENCODING_UTF16LE; | ||||
|                     break; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         handler = &defaultHandlers[enc]; | ||||
|         if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) && | ||||
|             (((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) { | ||||
|   | ||||
| @@ -119,7 +119,9 @@ typedef enum { | ||||
|     /** Create converter for input (conversion to UTF-8) */ | ||||
|     XML_ENC_INPUT = (1 << 0), | ||||
|     /** Create converter for output (conversion from UTF-8) */ | ||||
|     XML_ENC_OUTPUT = (1 << 1) | ||||
|     XML_ENC_OUTPUT = (1 << 1), | ||||
|     /** Use HTML5 mappings */ | ||||
|     XML_ENC_HTML = (1 << 2) | ||||
| } xmlCharEncFlags; | ||||
|  | ||||
| /** | ||||
|   | ||||
| @@ -1229,11 +1229,14 @@ xmlSwitchInputEncodingName(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, | ||||
|                            const char *encoding) { | ||||
|     xmlCharEncodingHandlerPtr handler; | ||||
|     xmlParserErrors res; | ||||
|     xmlCharEncFlags flags = XML_ENC_INPUT; | ||||
|  | ||||
|     if (encoding == NULL) | ||||
|         return(-1); | ||||
|  | ||||
|     res = xmlCreateCharEncodingHandler(encoding, XML_ENC_INPUT, | ||||
|     if (ctxt->html) | ||||
|         flags |= XML_ENC_HTML; | ||||
|     res = xmlCreateCharEncodingHandler(encoding, flags, | ||||
|             ctxt->convImpl, ctxt->convCtxt, &handler); | ||||
|     if (res == XML_ERR_UNSUPPORTED_ENCODING) { | ||||
|         xmlWarningMsg(ctxt, XML_ERR_UNSUPPORTED_ENCODING, | ||||
| @@ -1569,14 +1572,17 @@ xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) { | ||||
|         ((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) { | ||||
|         xmlCharEncodingHandlerPtr handler; | ||||
|         xmlParserErrors res; | ||||
|         xmlCharEncFlags flags = XML_ENC_INPUT; | ||||
|  | ||||
|         /* | ||||
|          * xmlSwitchEncodingName treats unsupported encodings as | ||||
|          * warnings, but we want it to be an error in an encoding | ||||
|          * declaration. | ||||
|          */ | ||||
|         if (ctxt->html) | ||||
|             flags |= XML_ENC_HTML; | ||||
|         res = xmlCreateCharEncodingHandler((const char *) encoding, | ||||
|                 XML_ENC_INPUT, ctxt->convImpl, ctxt->convCtxt, &handler); | ||||
|                 flags, ctxt->convImpl, ctxt->convCtxt, &handler); | ||||
|         if (res != XML_ERR_OK) { | ||||
|             xmlFatalErr(ctxt, res, (const char *) encoding); | ||||
|             xmlFree(encoding); | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| SAX.setDocumentLocator() | ||||
| SAX.startDocument() | ||||
| SAX.comment(?a) | ||||
| SAX.comment(?a“) | ||||
| SAX.endDocument() | ||||
|   | ||||
| @@ -1,5 +1,5 @@ | ||||
| SAX.setDocumentLocator() | ||||
| SAX.startDocument() | ||||
| SAX.comment(<!dOctYPE | ||||
| SAX.comment(‘<!dOctYPE | ||||
| ) | ||||
| SAX.endDocument() | ||||
|   | ||||
		Reference in New Issue
	
	Block a user