mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-21 14:53:44 +03:00
Move tools, source files and output tables into codegen directory. Rename some files. Adjust tools to match modified files. Remove generation date and source files from output. Distribute all tools and sources.
94 lines
2.8 KiB
Python
Executable File
94 lines
2.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import glob
|
|
import json
|
|
import re
|
|
|
|
# Maps the tokenizer state names used in the html5lib test files
# ('initialStates' entries) to the numeric state codes that this script
# writes into the header line of every generated test case.
# NOTE(review): the numbers presumably have to match what the consumer of
# the generated files expects -- confirm before renumbering.
state_map = {
    'Data state': 0,
    'RCDATA state': 1,
    'RAWTEXT state': 2,
    'PLAINTEXT state': 3,
    'Script data state': 4,
    # Tests that start in this state are skipped by the generation loop.
    'CDATA section state': 5,
}
|
|
|
|
# Literal "\uXXXX" escape (backslash, 'u', four hex digits) that appears in
# the text of test inputs and expected outputs.
_UNICODE_ESCAPE = re.compile(r'\\u([A-Fa-f0-9]{4})')

# Literal escape of a surrogate code unit (\uD800-\uDFFF), any letter case.
_SURROGATE_ESCAPE = re.compile(r'\\uD[89A-F]', re.I)

# Extracts the base name of a "*.test" path; accepts both '/' and '\' as
# the final path separator so the name is found regardless of platform.
_TEST_NAME = re.compile(r'[/\\]([^/\\]*)\.test$')


def _unescape(text):
    """Return *text* with every literal ``\\uXXXX`` escape replaced by the
    character with that code point."""
    return _UNICODE_ESCAPE.sub(lambda m: chr(int(m[1], 16)), text)


def _format_tokens(tokens):
    """Serialize a list of expected tokens into result-file text.

    Each token is a sequence whose first item is the token type and whose
    second item is its name or data.  The output is the type on one line
    followed by:

    * for ``DOCTYPE``: items 1-3, one per line, with ``None`` written as
      ``<none>``;
    * for every other type: item 1 on one line, and for ``StartTag`` the
      attributes from the dict in item 2 appended as `` name=value``.

    Tokens whose second item is exactly a single U+0000 character are
    dropped entirely.
    """
    parts = []
    for token in tokens:
        if token[1] == '\0':
            continue

        kind = token[0]
        parts.append(kind + '\n')

        if kind == 'DOCTYPE':
            for field in (token[1], token[2], token[3]):
                parts.append('<none>\n' if field is None else field + '\n')
        else:
            parts.append(token[1])
            if kind == 'StartTag':
                parts.extend(f' {name}={value}'
                             for name, value in token[2].items())
            parts.append('\n')

    return ''.join(parts)


# Convert every html5lib tokenizer test file into a pair of files:
#   test/html-tokenizer/<name>.test    -- header line + raw input per case
#   result/html-tokenizer/<name>.test  -- case number + expected tokens
for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
    match = _TEST_NAME.search(filename)
    if match is None:
        continue
    testname = match[1]
    if testname == 'xmlViolation':
        continue

    # The test files are JSON; read them as UTF-8 instead of relying on
    # the locale's default encoding.
    with open(filename, encoding='utf-8') as json_data:
        root = json.load(json_data)

    test_path = f'test/html-tokenizer/{testname}.test'
    result_path = f'result/html-tokenizer/{testname}.test'

    # Case numbers restart at zero for every output file.
    counter = 0

    # UTF-8 with newline translation disabled, so the bytes on disk are
    # exactly the ones counted in the header line below.  The context
    # manager closes both files even if a test raises.
    with open(test_path, 'w', encoding='utf-8', newline='') as test_out, \
            open(result_path, 'w', encoding='utf-8', newline='') as result_out:
        for tests in root.values():
            for test in tests:
                raw_input = test['input']

                # Skip surrogate tests
                if _SURROGATE_ESCAPE.search(raw_input):
                    continue

                test_input = _unescape(raw_input)
                output = _unescape(_format_tokens(test['output']))

                # The HTML5 spec splits handling of U+0000 across
                # tokenizer and tree builder. We already ignore
                # U+0000 in body text when tokenizing.
                output = output.replace('\x00', '')

                start_tag = test.get('lastStartTag', '-')

                # One generated case per initial state of the test.
                for state in test.get('initialStates', ['Data state']):
                    state_no = state_map.get(state)
                    if state_no is None:
                        raise ValueError(
                            f'{filename}: unknown state: {state}')
                    # 5 is 'CDATA section state' in state_map; no case is
                    # generated for it.
                    if state_no == 5:
                        continue

                    # Header: case number, last start tag ('-' if none),
                    # state number, UTF-8 byte length of the input.
                    test_out.write(f'{counter} {start_tag} {state_no} '
                                   f'{len(test_input.encode("utf-8"))}\n')
                    test_out.write(test_input)
                    test_out.write('\n')

                    result_out.write(f'{counter}\n')
                    result_out.write(output)

                    counter += 1
|