mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-21 14:53:44 +03:00
html: Rework meta charset handling
Don't use the encoding from meta tags when serializing. Only use the value in `doc->encoding`, matching the XML serializer. This is the actual encoding used when parsing. Stop modifying the input document by setting meta tags before serializing. Meta tags are now injected during serialization. Add full support for <meta charset="">, which is also used when adding meta tags. Align with HTML5 and implement the "algorithm for extracting a character encoding from a meta element". Only modify the encoding substring in Content-Type meta tags. Only switch encoding once when parsing. Fix htmlSaveFileFormat so that a NULL encoding no longer declares a misleading UTF-8 charset. Fixes #909.
This commit is contained in:
@@ -77,17 +77,14 @@ if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http
|
||||
sys.exit(1)
|
||||
str = doc.serialize("ISO-8859-1")
|
||||
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>
|
||||
<html><head><meta charset="ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>
|
||||
""":
|
||||
print("error serializing HTML document 2")
|
||||
sys.exit(1)
|
||||
str = doc.serialize(format=1)
|
||||
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
||||
<title>Hello</title>
|
||||
</head>
|
||||
<head><title>Hello</title></head>
|
||||
<body><p>hello</p></body>
|
||||
</html>
|
||||
""":
|
||||
@@ -97,13 +94,13 @@ str = doc.serialize("iso-8859-1", 1)
|
||||
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
||||
<meta charset="iso-8859-1">
|
||||
<title>Hello</title>
|
||||
</head>
|
||||
<body><p>hello</p></body>
|
||||
</html>
|
||||
""":
|
||||
print("error serializing HTML document 4")
|
||||
print("error serializing HTML document 4", str)
|
||||
sys.exit(1)
|
||||
|
||||
#
|
||||
@@ -116,15 +113,12 @@ if str != """<html><head><title>Hello</title></head><body><p>hello</p></body></h
|
||||
print("error serializing HTML root 1")
|
||||
sys.exit(1)
|
||||
str = root.serialize("ISO-8859-1")
|
||||
if str != """<html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>""":
|
||||
if str != """<html><head><meta charset="ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>""":
|
||||
print("error serializing HTML root 2")
|
||||
sys.exit(1)
|
||||
str = root.serialize(format=1)
|
||||
if str != """<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
||||
<title>Hello</title>
|
||||
</head>
|
||||
<head><title>Hello</title></head>
|
||||
<body><p>hello</p></body>
|
||||
</html>""":
|
||||
print("error serializing HTML root 3")
|
||||
@@ -132,7 +126,7 @@ if str != """<html>
|
||||
str = root.serialize("iso-8859-1", 1)
|
||||
if str != """<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
||||
<meta charset="iso-8859-1">
|
||||
<title>Hello</title>
|
||||
</head>
|
||||
<body><p>hello</p></body>
|
||||
|
Reference in New Issue
Block a user