mirror of
				https://gitlab.gnome.org/GNOME/libxml2.git
				synced 2025-10-24 13:33:01 +03:00 
			
		
		
		
	html: Parse tag and attribute names according to HTML5
HTML5 allows basically all characters in tag and attribute names.
This commit is contained in:
		
							
								
								
									
										70
									
								
								HTMLparser.c
									
									
									
									
									
								
							
							
						
						
									
										70
									
								
								HTMLparser.c
									
									
									
									
									
								
							| @@ -2441,33 +2441,36 @@ htmlSkipBogusComment(htmlParserCtxtPtr ctxt) { | |||||||
|  */ |  */ | ||||||
|  |  | ||||||
| static const xmlChar * | static const xmlChar * | ||||||
| htmlParseHTMLName(htmlParserCtxtPtr ctxt) { | htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) { | ||||||
|     const xmlChar *ret; |     const xmlChar *ret; | ||||||
|     int i = 0; |     int nbchar = 0; | ||||||
|     xmlChar loc[HTML_PARSER_BUFFER_SIZE]; |     int c, l; | ||||||
|  |     int stop = attr ? '=' : 0; | ||||||
|  |     xmlChar buf[HTML_PARSER_BUFFER_SIZE]; | ||||||
|  |  | ||||||
|     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && |     c = CUR_CHAR(l); | ||||||
|         (CUR != ':') && (CUR != '.')) return(NULL); |     while ((c != 0) && (c != '/') && (c != '>') && | ||||||
|  |            ((nbchar == 0) || (c != stop)) && | ||||||
|  |            (!IS_BLANK_CH(c))) { | ||||||
|  |         if (nbchar + l <= HTML_PARSER_BUFFER_SIZE) { | ||||||
|  |             if ((c >= 'A') && (c <= 'Z'))  { | ||||||
|  |                 buf[nbchar++] = c + 0x20; | ||||||
|  |             } else { | ||||||
|  | 	        COPY_BUF(buf, nbchar, c); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|     while ((i < HTML_PARSER_BUFFER_SIZE) && | 	NEXTL(l); | ||||||
|            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || |         c = CUR_CHAR(l); | ||||||
| 	   (CUR == ':') || (CUR == '-') || (CUR == '_') || |  | ||||||
|            (CUR == '.'))) { |  | ||||||
| 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; |  | ||||||
|         else loc[i] = CUR; |  | ||||||
| 	i++; |  | ||||||
|  |  | ||||||
| 	NEXT; |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     ret = xmlDictLookup(ctxt->dict, loc, i); |     ret = xmlDictLookup(ctxt->dict, buf, nbchar); | ||||||
|     if (ret == NULL) |     if (ret == NULL) | ||||||
|         htmlErrMemory(ctxt); |         htmlErrMemory(ctxt); | ||||||
|  |  | ||||||
|     return(ret); |     return(ret); | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| /** | /** | ||||||
|  * htmlParseHTMLName_nonInvasive: |  * htmlParseHTMLName_nonInvasive: | ||||||
|  * @ctxt:  an HTML parser context |  * @ctxt:  an HTML parser context | ||||||
| @@ -2481,22 +2484,31 @@ htmlParseHTMLName(htmlParserCtxtPtr ctxt) { | |||||||
|  |  | ||||||
| static const xmlChar * | static const xmlChar * | ||||||
| htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { | htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { | ||||||
|  |     int nbchar = 0; | ||||||
|     int i = 0; |     int i = 0; | ||||||
|     xmlChar loc[HTML_PARSER_BUFFER_SIZE]; |     int c, l; | ||||||
|  |     xmlChar buf[HTML_PARSER_BUFFER_SIZE]; | ||||||
|     const xmlChar *ret; |     const xmlChar *ret; | ||||||
|  |     size_t avail = ctxt->input->end - ctxt->input->cur; | ||||||
|  |  | ||||||
|     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && |     l = avail - i; | ||||||
|         (NXT(1) != ':')) return(NULL); |     c = xmlGetUTF8Char(CUR_PTR + i, &l); | ||||||
|  |     while ((c > 0) && (c != '/') && (c != '>') && | ||||||
|  |            (!IS_BLANK_CH(c))) { | ||||||
|  |         if (nbchar + l <= HTML_PARSER_BUFFER_SIZE) { | ||||||
|  |             if ((c >= 'A') && (c <= 'Z'))  { | ||||||
|  |                 buf[nbchar++] = c + 0x20; | ||||||
|  |             } else { | ||||||
|  | 	        COPY_BUF(buf, nbchar, c); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|     while ((i < HTML_PARSER_BUFFER_SIZE) && | 	i += l; | ||||||
|            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || |         l = avail - i; | ||||||
| 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { |         c = xmlGetUTF8Char(CUR_PTR + i, &l); | ||||||
| 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; |  | ||||||
|         else loc[i] = NXT(1+i); |  | ||||||
| 	i++; |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     ret = xmlDictLookup(ctxt->dict, loc, i); |     ret = xmlDictLookup(ctxt->dict, buf, nbchar); | ||||||
|     if (ret == NULL) |     if (ret == NULL) | ||||||
|         htmlErrMemory(ctxt); |         htmlErrMemory(ctxt); | ||||||
|  |  | ||||||
| @@ -3636,7 +3648,7 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { | |||||||
|     xmlChar *val = NULL; |     xmlChar *val = NULL; | ||||||
|  |  | ||||||
|     *value = NULL; |     *value = NULL; | ||||||
|     name = htmlParseHTMLName(ctxt); |     name = htmlParseHTMLName(ctxt, 1); | ||||||
|     if (name == NULL) { |     if (name == NULL) { | ||||||
| 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, | 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, | ||||||
| 	             "error parsing attribute name\n", NULL, NULL); | 	             "error parsing attribute name\n", NULL, NULL); | ||||||
| @@ -3777,7 +3789,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { | |||||||
|     maxatts = ctxt->maxatts; |     maxatts = ctxt->maxatts; | ||||||
|  |  | ||||||
|     GROW; |     GROW; | ||||||
|     name = htmlParseHTMLName(ctxt); |     name = htmlParseHTMLName(ctxt, 0); | ||||||
|     if (name == NULL) { |     if (name == NULL) { | ||||||
| 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, | 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, | ||||||
| 	             "htmlParseStartTag: invalid element name\n", | 	             "htmlParseStartTag: invalid element name\n", | ||||||
| @@ -3970,7 +3982,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) | |||||||
|     } |     } | ||||||
|     SKIP(2); |     SKIP(2); | ||||||
|  |  | ||||||
|     name = htmlParseHTMLName(ctxt); |     name = htmlParseHTMLName(ctxt, 0); | ||||||
|     if (name == NULL) |     if (name == NULL) | ||||||
|         return (0); |         return (0); | ||||||
|     /* |     /* | ||||||
|   | |||||||
| @@ -97,7 +97,7 @@ eval("page" + id + " = window.open(URL, '" + id + "', 'toolbars=0, scrollbars=0, | |||||||
|                         </p> |                         </p> | ||||||
| </td> | </td> | ||||||
|                       <td valign="top" width="15%"> |                       <td valign="top" width="15%"> | ||||||
|                         <p align="center"><a href="http://www.gentus.com/"><img align="bottom" alt="Taking a first look at the Abit Linux release called " border="0" height="45" src="doc3_files/gentusbox.gif" width="70" gentus></a><br><a href="http://www.gentus.com/"><font color="white" face="Verdana" size="1">Gentus</font></a>  |                         <p align="center"><a href="http://www.gentus.com/"><img align="bottom" alt="Taking a first look at the Abit Linux release called " border="0" height="45" src="doc3_files/gentusbox.gif" width="70" gentus?.?></a><br><a href="http://www.gentus.com/"><font color="white" face="Verdana" size="1">Gentus</font></a>  | ||||||
|                   </p> |                   </p> | ||||||
| </td> | </td> | ||||||
| </tr> | </tr> | ||||||
|   | |||||||
| @@ -10,9 +10,6 @@ _top"><img src="http://ads.gamesquad.net/addclick.exe/adcycle.cgi?group=52&media | |||||||
| ./test/HTML/doc3.htm:52: HTML parser error : htmlParseEntityRef: expecting ';' | ./test/HTML/doc3.htm:52: HTML parser error : htmlParseEntityRef: expecting ';' | ||||||
| ><img src="http://ads.gamesquad.net/addclick.exe/adcycle.cgi?group=52&media=1&id | ><img src="http://ads.gamesquad.net/addclick.exe/adcycle.cgi?group=52&media=1&id | ||||||
|                                                                                ^ |                                                                                ^ | ||||||
| ./test/HTML/doc3.htm:145: HTML parser error : error parsing attribute name |  | ||||||
|                         width=70 Gentus?.?></A><BR><A  |  | ||||||
|                                        ^ |  | ||||||
| ./test/HTML/doc3.htm:148: HTML parser error : Unexpected end tag : p | ./test/HTML/doc3.htm:148: HTML parser error : Unexpected end tag : p | ||||||
|   </P></TD></TR></TBODY></TABLE></CENTER></TD></TR></TBODY></TABLE></CENTER></P> |   </P></TD></TR></TBODY></TABLE></CENTER></TD></TR></TBODY></TABLE></CENTER></P> | ||||||
|                                                                                ^ |                                                                                ^ | ||||||
| @@ -43,10 +40,10 @@ om/ad_static.asp?pid=2097&sid=1881&asid=7708"></a></IFRAME></CENTER></LI></FONT> | |||||||
| ./test/HTML/doc3.htm:795: HTML parser error : Unexpected end tag : iframe | ./test/HTML/doc3.htm:795: HTML parser error : Unexpected end tag : iframe | ||||||
| 							document.write("42DF8478957377></IFRAME>"); | 							document.write("42DF8478957377></IFRAME>"); | ||||||
| 							                                        ^ | 							                                        ^ | ||||||
| ./test/HTML/doc3.htm:803: HTML parser error : End tag : expected '>' | ./test/HTML/doc3.htm:804: HTML parser error : End tag : expected '>' | ||||||
| 							document.write("DF8478957377></SC"); | 							document.write("RIPT>"); | ||||||
| 							                                 ^ | 							^ | ||||||
| ./test/HTML/doc3.htm:804: HTML parser error : Unexpected end tag : sc | ./test/HTML/doc3.htm:804: HTML parser error : Unexpected end tag : sc"); | ||||||
| 							document.write("RIPT>"); | 							document.write("RIPT>"); | ||||||
| 							                     ^ | 							                     ^ | ||||||
| ./test/HTML/doc3.htm:811: HTML parser error : Unexpected end tag : a | ./test/HTML/doc3.htm:811: HTML parser error : Unexpected end tag : a | ||||||
|   | |||||||
| @@ -319,8 +319,7 @@ SAX.characters( | |||||||
|                         , 26) |                         , 26) | ||||||
| SAX.startElement(p, align='center') | SAX.startElement(p, align='center') | ||||||
| SAX.startElement(a, href='http://www.gentus.com/') | SAX.startElement(a, href='http://www.gentus.com/') | ||||||
| SAX.error: error parsing attribute name | SAX.startElement(img, align='bottom', alt='Taking a first look at the Abit Linux release called ', border='0', height='45', src='doc3_files/gentusbox.gif', width='70', gentus?.?) | ||||||
| SAX.startElement(img, align='bottom', alt='Taking a first look at the Abit Linux release called ', border='0', height='45', src='doc3_files/gentusbox.gif', width='70', gentus) |  | ||||||
| SAX.endElement(img) | SAX.endElement(img) | ||||||
| SAX.endElement(a) | SAX.endElement(a) | ||||||
| SAX.startElement(br) | SAX.startElement(br) | ||||||
| @@ -2698,7 +2697,7 @@ SAX.error: Unexpected end tag : iframe | |||||||
| 							if ((, 532) | 							if ((, 532) | ||||||
| SAX.error: Unexpected end tag : iframe | SAX.error: Unexpected end tag : iframe | ||||||
| SAX.cdata("); | SAX.cdata("); | ||||||
| 							} else if ((parseI, 463) | 							} else if ((parseI, 463) | ||||||
| SAX.error: End tag : expected '>' | SAX.error: End tag : expected '>' | ||||||
| SAX.error: Unexpected end tag : sc"); | SAX.error: Unexpected end tag : sc"); | ||||||
| SAX.cdata("); | SAX.cdata("); | ||||||
|   | |||||||
| @@ -462,13 +462,13 @@ or <a href="/news/pointcast/0,1366,,00.html">PointCast</a></font><br> | |||||||
|  |  | ||||||
| <font size="2" face="Arial,Helvetica, sans-serif"><b><a href="/news/school/0,1383,,00.html">Making the Grade</a></b></font><br><font size="2" face="Arial, Helvetica, sans-serif"><font size="1" face="Arial, Geneva, sans-serif" color="#000000">Reading, writing, and ROM.  <br><i>Sponsored by <a href="http://r.hotwired.com/r/wn_sch_r_nav_uop/http://ads25.focalink.com/SmartBanner/page?12630.53" style="text-decoration:none"><font color="#000000">U of Phoenix</font></a></i></font><br><br>  | <font size="2" face="Arial,Helvetica, sans-serif"><b><a href="/news/school/0,1383,,00.html">Making the Grade</a></b></font><br><font size="2" face="Arial, Helvetica, sans-serif"><font size="1" face="Arial, Geneva, sans-serif" color="#000000">Reading, writing, and ROM.  <br><i>Sponsored by <a href="http://r.hotwired.com/r/wn_sch_r_nav_uop/http://ads25.focalink.com/SmartBanner/page?12630.53" style="text-decoration:none"><font color="#000000">U of Phoenix</font></a></i></font><br><br>  | ||||||
|  |  | ||||||
| <font size="2" face="Arial,Helvetica, sans-serif"><b><a href="/news/infostructure/0,1377,,00.html">Infostructure</a></b></font><br><font size="1" face="Arial, Helvetica, sans-serif" color="#000000">An IS/IT resource <br><i>Sponsored by <a href="http://r.wired.com/r/wn_is_r_ssec/http://ad.doubleclick.net/clk;653163;3599571;s?http://www.sprintbiz.com/s%0Aervlet/appservlet?from=/wired/sprint/&template=/security/security.html&SITE=%0Awired.com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Sprint</font></a></i></font><br><br>  | <font size="2" face="Arial,Helvetica, sans-serif"><b><a href="/news/infostructure/0,1377,,00.html">Infostructure</a></b></font><br><font size="1" face="Arial, Helvetica, sans-serif" color="#000000">An IS/IT resource <br><i>Sponsored by <a href="http://r.wired.com/r/wn_is_r_ssec/http://ad.doubleclick.net/clk;653163;3599571;s?http://www.sprintbiz.com/s%0Aervlet/appservlet?from=/wired/sprint/&template=/security/security.html&SITE=%0Awired.com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Sprint</font></a></i><br><br>  | ||||||
|  |  | ||||||
| <font size="2" face="Arial,Helvetica, sans-serif"><b><a href="/news/y2k/0,1360,,00.html">Y2K Watch</a></b></font><br><font size="2" face="Arial, Helvetica, sans-serif"><font size="1" face="Arial, Geneva, sans-serif" color="#000000">Tick... Tick... Tick...</font><br><br>  | <font size="2" face="Arial,Helvetica, sans-serif"><b><a href="/news/y2k/0,1360,,00.html">Y2K Watch</a></b></font><br><font size="2" face="Arial, Helvetica, sans-serif"><font size="1" face="Arial, Geneva, sans-serif" color="#000000">Tick... Tick... Tick...</font><br><br>  | ||||||
|  |  | ||||||
| <font face="Arial, Helvetica, sans-serif" size="2"><b><i><a href="/news/special_reports/1,1293,,00.html">More Hoo-Ha</a></i></b></font><br> <br> | <font face="Arial, Helvetica, sans-serif" size="2"><b><i><a href="/news/special_reports/1,1293,,00.html">More Hoo-Ha</a></i></b></font><br> <br> | ||||||
|  |  | ||||||
| </font></font></font></font></font></font></font></font></font> | </font></font></font></font></font></font></font></font></font></font> | ||||||
| </td> | </td> | ||||||
| </tr> | </tr> | ||||||
| <!-- start of Gen News --> | <!-- start of Gen News --> | ||||||
|   | |||||||
| @@ -218,8 +218,14 @@ wired.com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Spri | |||||||
| com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Sprint</a> | com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Sprint</a> | ||||||
|                                                                                ^ |                                                                                ^ | ||||||
| ./test/HTML/wired.html:408: HTML parser error : End tag : expected '>' | ./test/HTML/wired.html:408: HTML parser error : End tag : expected '>' | ||||||
| =Sprint" style="text-decoration:none"><font color="#000000">Sprint</a></i></font | Sprint" style="text-decoration:none"><font color="#000000">Sprint</a></i></font< | ||||||
|                                                                                ^ |                                                                                ^ | ||||||
|  | ./test/HTML/wired.html:408: HTML parser error : Unexpected end tag : font< | ||||||
|  | " style="text-decoration:none"><font color="#000000">Sprint</a></i></font</font> | ||||||
|  |                                                                                ^ | ||||||
|  | ./test/HTML/wired.html:414: HTML parser error : Opening and ending tag mismatch: td and font | ||||||
|  | </td> | ||||||
|  |      ^ | ||||||
| ./test/HTML/wired.html:414: HTML parser error : Opening and ending tag mismatch: td and font | ./test/HTML/wired.html:414: HTML parser error : Opening and ending tag mismatch: td and font | ||||||
| </td> | </td> | ||||||
|      ^ |      ^ | ||||||
|   | |||||||
| @@ -1961,7 +1961,7 @@ SAX.endElement(font) | |||||||
| SAX.endElement(a) | SAX.endElement(a) | ||||||
| SAX.endElement(i) | SAX.endElement(i) | ||||||
| SAX.error: End tag : expected '>' | SAX.error: End tag : expected '>' | ||||||
| SAX.endElement(font) | SAX.error: Unexpected end tag : font< | ||||||
| SAX.startElement(br) | SAX.startElement(br) | ||||||
| SAX.endElement(br) | SAX.endElement(br) | ||||||
| SAX.startElement(br) | SAX.startElement(br) | ||||||
| @@ -2024,6 +2024,8 @@ SAX.error: Opening and ending tag mismatch: td and font | |||||||
| SAX.endElement(font) | SAX.endElement(font) | ||||||
| SAX.error: Opening and ending tag mismatch: td and font | SAX.error: Opening and ending tag mismatch: td and font | ||||||
| SAX.endElement(font) | SAX.endElement(font) | ||||||
|  | SAX.error: Opening and ending tag mismatch: td and font | ||||||
|  | SAX.endElement(font) | ||||||
| SAX.endElement(td) | SAX.endElement(td) | ||||||
| SAX.characters( | SAX.characters( | ||||||
| , 1) | , 1) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user