mirror of
				https://gitlab.gnome.org/GNOME/libxml2.git
				synced 2025-10-24 13:33:01 +03:00 
			
		
		
		
	html: Make implied <p> tags more deterministic
libxml2's HTML parser adds <p> start tags in some situations. This behavior, which doesn't follow any standard, was added in 2000 (see http://veillard.com/XML/messages/0655.html). Text nodes that only contain whitespace don't imply a <p> tag, but the whitespace check cannot work reliably if we're parsing partial text data, which can happen with both the pull and the push parser. The logic in `areBlanks` is hard to follow. The checks involving `CUR` depend on the position of the input pointer and seem dubious. It's also possible that the behavior changed inadvertently with a later commit. As a result, it's hard to come up with good test cases. We now process leading whitespace before creating implied tags. This is more in line with HTML5 and should avoid at least some issues with partial text data. For example, parsing the string "<head> x" used to result in: <html> <head></head> <body><p> x</p></body> </html> And now results in: <html> <head> </head> <body><p>x</p></body> </html> Except for the implied <p> tag, this matches HTML5.
This commit is contained in:
		
							
								
								
									
										40
									
								
								HTMLparser.c
									
									
									
									
									
								
							
							
						
						
									
										40
									
								
								HTMLparser.c
									
									
									
									
									
								
							| @@ -2965,16 +2965,44 @@ htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf, | |||||||
|  |  | ||||||
|     if ((mode == 0) || (mode == DATA_RCDATA) || |     if ((mode == 0) || (mode == DATA_RCDATA) || | ||||||
|         (ctxt->sax->cdataBlock == NULL)) { |         (ctxt->sax->cdataBlock == NULL)) { | ||||||
|         int blank = areBlanks(ctxt, buf, size); |         if ((ctxt->name == NULL) || | ||||||
|  |             (xmlStrEqual(ctxt->name, BAD_CAST "html")) || | ||||||
|  |             (xmlStrEqual(ctxt->name, BAD_CAST "head"))) { | ||||||
|  |             int i; | ||||||
|  |  | ||||||
|         if ((mode == 0) && (blank > 0) && (!ctxt->keepBlanks)) { |             /* | ||||||
|  |              * Add leading whitespace to html or head elements before | ||||||
|  |              * calling htmlCheckParagraph. | ||||||
|  |              */ | ||||||
|  |             for (i = 0; i < size; i++) | ||||||
|  |                 if (!IS_WS_HTML(buf[i])) | ||||||
|  |                     break; | ||||||
|  |  | ||||||
|  |             if (i > 0) { | ||||||
|  |                 if (!ctxt->keepBlanks) { | ||||||
|                     if (ctxt->sax->ignorableWhitespace != NULL) |                     if (ctxt->sax->ignorableWhitespace != NULL) | ||||||
|                 ctxt->sax->ignorableWhitespace(ctxt->userData, |                         ctxt->sax->ignorableWhitespace(ctxt->userData, buf, i); | ||||||
|                                                buf, size); |  | ||||||
|                 } else { |                 } else { | ||||||
|             if ((mode == 0) && (blank < 0)) |                     if (ctxt->sax->characters != NULL) | ||||||
|                 htmlCheckParagraph(ctxt); |                         ctxt->sax->characters(ctxt->userData, buf, i); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 buf += i; | ||||||
|  |                 size -= i; | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             if (size <= 0) | ||||||
|  |                 return; | ||||||
|  |  | ||||||
|  |             htmlCheckParagraph(ctxt); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         if ((mode == 0) && | ||||||
|  |             (!ctxt->keepBlanks) && | ||||||
|  |             (areBlanks(ctxt, buf, size))) { | ||||||
|  |             if (ctxt->sax->ignorableWhitespace != NULL) | ||||||
|  |                 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size); | ||||||
|  |         } else { | ||||||
|             if (ctxt->sax->characters != NULL) |             if (ctxt->sax->characters != NULL) | ||||||
|                 ctxt->sax->characters(ctxt->userData, buf, size); |                 ctxt->sax->characters(ctxt->userData, buf, size); | ||||||
|         } |         } | ||||||
|   | |||||||
							
								
								
									
										6
									
								
								result/HTML/implied1.html
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								result/HTML/implied1.html
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,6 @@ | |||||||
|  | <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd"> | ||||||
|  | <html> | ||||||
|  | <head>   </head> | ||||||
|  | <body><p>x | ||||||
|  | </p></body> | ||||||
|  | </html> | ||||||
							
								
								
									
										14
									
								
								result/HTML/implied1.html.sax
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								result/HTML/implied1.html.sax
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | |||||||
|  | SAX.setDocumentLocator() | ||||||
|  | SAX.startDocument() | ||||||
|  | SAX.startElement(html) | ||||||
|  | SAX.startElement(head) | ||||||
|  | SAX.characters(   , 3) | ||||||
|  | SAX.endElement(head) | ||||||
|  | SAX.startElement(body) | ||||||
|  | SAX.startElement(p) | ||||||
|  | SAX.characters(x | ||||||
|  | , 2) | ||||||
|  | SAX.endElement(p) | ||||||
|  | SAX.endElement(body) | ||||||
|  | SAX.endElement(html) | ||||||
|  | SAX.endDocument() | ||||||
							
								
								
									
										1
									
								
								test/HTML/implied1.html
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								test/HTML/implied1.html
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | |||||||
|  | <head>   x | ||||||
		Reference in New Issue
	
	Block a user