# libxml2/codegen/genHtml5LibTests.py

#!/usr/bin/env python3

import glob
import json
import re

# Tokenizer start states, keyed by the names that appear in the
# "initialStates" field of the html5lib tokenizer tests. The number
# assigned to a state is its position in this tuple (starting at 0).
state_map = {
    state: number
    for number, state in enumerate((
        'Data state',
        'RCDATA state',
        'RAWTEXT state',
        'PLAINTEXT state',
        'Script data state',
        'CDATA section state',
    ))
}

# A literal backslash-u escape followed by exactly four hex digits.
UNICODE_ESCAPE_RE = re.compile(r'\\u([A-Fa-f0-9]{4})')
# A literal backslash-u escape whose code unit lies in the surrogate
# range (D800-DFFF).
SURROGATE_ESCAPE_RE = re.compile(r'\\uD[89A-F]', re.I)


def unescape_unicode(text):
    r"""Return *text* with every literal ``\uXXXX`` escape decoded.

    Each backslash-u sequence followed by four hex digits is replaced
    with the character that has that code point; all other text is
    returned unchanged.
    """
    return UNICODE_ESCAPE_RE.sub(lambda m: chr(int(m[1], 16)), text)


def format_tokens(tokens):
    r"""Render a test's expected tokens as result-file text.

    Every token contributes a line holding its type (``token[0]``)
    followed by:

    * ``DOCTYPE``: three lines for ``token[1]`` .. ``token[3]``, with
      ``<none>`` standing in for ``None``;
    * any other type: one line holding ``token[1]``; for ``StartTag``
      the attributes in ``token[2]`` are appended as `` name=value``.

    Tokens whose ``token[1]`` is exactly a single U+0000 are dropped.
    ``\uXXXX`` escapes in the rendered text are decoded and any
    remaining U+0000 characters are removed.

    Raises ``ValueError`` if a DOCTYPE token has fewer than four items.
    """
    parts = []
    for token in tokens:
        kind = token[0]
        if token[1] == '\0':
            continue

        parts.append(kind + '\n')

        if kind == 'DOCTYPE':
            # Unpacking (rather than slicing in the loop) keeps a
            # malformed, too-short DOCTYPE token a hard error.
            name, public_id, system_id = token[1:4]
            for field in (name, public_id, system_id):
                parts.append('<none>\n' if field is None else field + '\n')
        else:
            parts.append(token[1])
            if kind == 'StartTag':
                parts.extend(f' {name}={value}'
                             for name, value in token[2].items())
            parts.append('\n')

    text = unescape_unicode(''.join(parts))

    # The HTML5 spec splits handling of U+0000 across
    # tokenizer and tree builder. We already ignore
    # U+0000 in body text when tokenizing.
    return text.replace('\x00', '')


def write_tests(root, states, test_out, result_out, filename):
    r"""Write every test found in *root* to the two output streams.

    Parameters:
        root: mapping loaded from one html5lib tokenizer JSON file; each
            value is a list of test dicts with ``input`` and ``output``
            keys and optional ``initialStates`` / ``lastStartTag`` keys.
        states: mapping from initial state name to state number.
        test_out: text stream receiving, per emitted test, a header line
            ``<counter> <lastStartTag or -> <state number> <UTF-8 byte
            length of input>``, then the input and a newline.
        result_out: text stream receiving, per emitted test, a line with
            the counter followed by the rendered expected tokens.
        filename: name of the source file, used only in error messages.

    Tests whose input contains a surrogate ``\uDxxx`` escape are
    skipped, as are entries for the 'CDATA section state'. One test is
    emitted per remaining initial state (default: 'Data state').

    Returns the number of tests written.

    Raises ``ValueError`` if a test names a state missing from *states*.
    """
    counter = 0

    for tests in root.values():
        for test in tests:
            source = test['input']

            # Skip surrogate tests
            if SURROGATE_ESCAPE_RE.search(source):
                continue

            source = unescape_unicode(source)
            expected = format_tokens(test['output'])
            start_tag = test.get('lastStartTag', '-')

            for state in test.get('initialStates', ['Data state']):
                state_no = states.get(state)
                if state_no is None:
                    raise ValueError(f'{filename}: unknown state: {state}')
                # Tests for the CDATA section state are not emitted.
                if state == 'CDATA section state':
                    continue

                # The length is in UTF-8 bytes, which is why the output
                # files must be written as UTF-8 without newline
                # translation.
                test_out.write(f'{counter} {start_tag} {state_no} '
                               f'{len(source.encode())}\n')
                test_out.write(source)
                test_out.write('\n')

                result_out.write(f'{counter}\n')
                result_out.write(expected)

                counter += 1

    return counter


# Convert every html5lib tokenizer test file into a pair of files under
# test/html-tokenizer/ and result/html-tokenizer/. Paths are relative to
# the current working directory.
for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
    match = re.search(r'/([^/]*)\.test$', filename)
    if match is None:
        continue
    testname = match[1]
    if testname == 'xmlViolation':
        continue

    with open(filename, encoding='utf-8') as json_data:
        root = json.load(json_data)

    # Both files stay open for all groups in the JSON document and are
    # closed exactly once, even if conversion raises.
    with open(f'test/html-tokenizer/{testname}.test', 'w',
              encoding='utf-8', newline='\n') as test_out, \
         open(f'result/html-tokenizer/{testname}.test', 'w',
              encoding='utf-8', newline='\n') as result_out:
        write_tests(root, state_map, test_out, result_out, filename)