mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-24 13:33:01 +03:00
html: Test tokenizer against html5lib test suite
This commit is contained in:
86
tools/genHtml5LibTests.py
Normal file
86
tools/genHtml5LibTests.py
Normal file
@@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import glob
|
||||
import json
|
||||
import re
|
||||
|
||||
state_map = {
|
||||
'Data state': 0,
|
||||
'RCDATA state': 1,
|
||||
'RAWTEXT state': 2,
|
||||
'PLAINTEXT state': 3,
|
||||
'Script data state': 4,
|
||||
'CDATA section state': 5,
|
||||
}
|
||||
|
||||
for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
|
||||
match = re.search('/([^/]*).test$', filename)
|
||||
if match is None:
|
||||
continue
|
||||
testname = match[1]
|
||||
if testname == 'xmlViolation':
|
||||
continue
|
||||
|
||||
with open(filename) as json_data:
|
||||
root = json.load(json_data)
|
||||
|
||||
test_out = open(f'test/html-tokenizer/{testname}.test', 'w')
|
||||
result_out = open(f'result/html-tokenizer/{testname}.test', 'w')
|
||||
|
||||
counter = 0
|
||||
|
||||
for tests in root.values():
|
||||
for test in tests:
|
||||
input = test['input']
|
||||
|
||||
# Skip surrogate tests
|
||||
if re.search(r'\\uD[89A-F]', input, re.I):
|
||||
continue
|
||||
|
||||
input = re.sub(r'\\u([A-Fa-f0-9]{4})',
|
||||
lambda m: chr(int(m[1], 16)),
|
||||
input)
|
||||
|
||||
output = ''
|
||||
for token in test['output']:
|
||||
output += token[0] + '\n'
|
||||
|
||||
if token[0] == 'DOCTYPE':
|
||||
for i in range(1, 4):
|
||||
if token[i] is None:
|
||||
output += '<none>\n'
|
||||
else:
|
||||
output += token[i] + '\n'
|
||||
else:
|
||||
output += token[1]
|
||||
if token[0] == 'StartTag':
|
||||
for name, value in token[2].items():
|
||||
output += f' {name}={value}'
|
||||
output += '\n'
|
||||
|
||||
output = re.sub(r'\\u([A-Fa-f0-9]{4})',
|
||||
lambda m: chr(int(m[1], 16)),
|
||||
output)
|
||||
output = re.sub(r'\x00', '\uFFFD', output)
|
||||
|
||||
for state in test.get('initialStates', ['Data state']):
|
||||
state_no = state_map.get(state)
|
||||
if state_no is None:
|
||||
raise Exception(f'{filename}: unknown state: {state}')
|
||||
if state_no == 5:
|
||||
continue
|
||||
|
||||
start_tag = test.get('lastStartTag', '-')
|
||||
|
||||
test_out.write(f'{counter} {start_tag} {state_no} '
|
||||
f'{len(input.encode())}\n')
|
||||
test_out.write(input)
|
||||
test_out.write('\n')
|
||||
|
||||
result_out.write(f'{counter}\n')
|
||||
result_out.write(output)
|
||||
|
||||
counter += 1
|
||||
|
||||
test_out.close()
|
||||
result_out.close()
|
||||
Reference in New Issue
Block a user