1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-21 14:53:44 +03:00
Files
libxml2/codegen/genHtml5LibTests.py
Nick Wellnhofer 258d870629 codegen: Consolidate tools for code generation
Move tools, source files and output tables into codegen directory.

Rename some files.

Adjust tools to match modified files. Remove generation date and source
files from output.

Distribute all tools and sources.
2025-05-16 18:03:12 +02:00

94 lines
2.8 KiB
Python
Executable File

#!/usr/bin/env python3
import glob
import json
import re
# Maps the tokenizer state names found in a test's 'initialStates' list
# to the state numbers written to the generated test files.
state_map = {
    'Data state': 0,
    'RCDATA state': 1,
    'RAWTEXT state': 2,
    'PLAINTEXT state': 3,
    'Script data state': 4,
    'CDATA section state': 5,
}

# Tests for this state are not written to the generated files.
_SKIPPED_STATE_NO = state_map['CDATA section state']

# Base name of a '*.test' file; accepts either path separator.
_TEST_NAME_RE = re.compile(r'([^/\\]*)\.test$')

# A literal (double-escaped) '\uXXXX' sequence inside test data.
_UNICODE_ESCAPE_RE = re.compile(r'\\u([A-Fa-f0-9]{4})')

# A literal '\uXXXX' sequence that denotes a surrogate code unit.
_SURROGATE_ESCAPE_RE = re.compile(r'\\uD[89A-F]', re.I)


def _test_name(filename):
    """Return the base name of a '.test' file path, or None if it has none."""
    match = _TEST_NAME_RE.search(filename)
    return None if match is None else match[1]


def _unescape(text):
    """Replace every literal '\\uXXXX' sequence with the character it names."""
    return _UNICODE_ESCAPE_RE.sub(lambda m: chr(int(m[1], 16)), text)


def _format_tokens(tokens):
    """Render a test's expected token list as the text of a result record.

    Every token contributes a line with its type. A DOCTYPE token is
    followed by one line for each of its fields 1 to 3, with '<none>'
    standing in for None. Any other token is followed by a single line
    holding its second field; for a StartTag, ' name=value' is appended
    to that line for each attribute.
    """
    lines = []
    for token in tokens:
        # A token whose second field is a lone U+0000 produces no output.
        if token[1] == '\0':
            continue

        kind = token[0]
        lines.append(kind)

        if kind == 'DOCTYPE':
            for i in range(1, 4):
                lines.append('<none>' if token[i] is None else token[i])
        else:
            line = token[1]
            if kind == 'StartTag':
                line += ''.join(f' {name}={value}'
                                for name, value in token[2].items())
            lines.append(line)

    output = _unescape(''.join(line + '\n' for line in lines))

    # The HTML5 spec splits handling of U+0000 across
    # tokenizer and tree builder. We already ignore
    # U+0000 in body text when tokenizing.
    return output.replace('\x00', '')


def _state_numbers(test, filename):
    """Return the state numbers for which `test` has to be emitted.

    A test without 'initialStates' runs in the 'Data state' only.

    Raises:
        ValueError: if the test names a state missing from state_map.
            `filename` is only used in the error message.
    """
    numbers = []
    for state in test.get('initialStates', ['Data state']):
        state_no = state_map.get(state)
        if state_no is None:
            raise ValueError(f'{filename}: unknown state: {state}')
        if state_no != _SKIPPED_STATE_NO:
            numbers.append(state_no)
    return numbers


def _convert_file(filename, testname):
    """Convert one html5lib tokenizer test file.

    Reads the JSON document `filename` and writes
    'test/html-tokenizer/<testname>.test' and
    'result/html-tokenizer/<testname>.test', relative to the current
    directory. For every emitted test and state, the test file receives
    a header line '<counter> <lastStartTag or -> <state number>
    <UTF-8 byte length of the input>' followed by the input and a
    newline, and the result file receives the same counter on its own
    line followed by the formatted tokens.
    """
    with open(filename, encoding='utf-8') as json_data:
        root = json.load(json_data)

    # The header records the UTF-8 byte length of the input, so the
    # files must be written as UTF-8 and without newline translation
    # for that length to match the bytes that end up on disk.
    with open(f'test/html-tokenizer/{testname}.test', 'w',
              encoding='utf-8', newline='') as test_out, \
         open(f'result/html-tokenizer/{testname}.test', 'w',
              encoding='utf-8', newline='') as result_out:
        counter = 0

        for tests in root.values():
            for test in tests:
                raw_input = test['input']

                # Skip surrogate tests
                if _SURROGATE_ESCAPE_RE.search(raw_input):
                    continue

                text = _unescape(raw_input)
                output = _format_tokens(test['output'])
                start_tag = test.get('lastStartTag', '-')

                for state_no in _state_numbers(test, filename):
                    byte_len = len(text.encode('utf-8'))
                    test_out.write(f'{counter} {start_tag} {state_no} '
                                   f'{byte_len}\n')
                    test_out.write(text)
                    test_out.write('\n')

                    result_out.write(f'{counter}\n')
                    result_out.write(output)

                    counter += 1


for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
    testname = _test_name(filename)
    # The 'xmlViolation' test file is not converted.
    if testname is None or testname == 'xmlViolation':
        continue
    _convert_file(filename, testname)