1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-21 14:53:44 +03:00
Files
libxml2/codegen/genHtml5Ent.py
Nick Wellnhofer 258d870629 codegen: Consolidate tools for code generation
Move tools, source files and output tables into codegen directory.

Rename some files.

Adjust tools to match modified files. Remove generation date and source
files from output.

Distribute all tools and sources.
2025-05-16 18:03:12 +02:00

171 lines
4.6 KiB
Python
Executable File

#!/usr/bin/env python3
import json
import sys
from dataclasses import dataclass
# The basic idea is to find named character references using binary
# search. Since entity strings may not have a terminator, this doesn't
# work if one entity string is a prefix of another. In this case,
# we branch to a subtable after matching the prefix.
#
# We create separate initial tables based on the first character
# of the entity name.
#
# The following tables are generated:
#
# htmlEntAlpha:   three bytes per initial table: its start in
#                 htmlEntValues (low byte, high byte) and its length
# htmlEntValues:  concatenation of all table values, which index into
#                 htmlEntStrings
# htmlEntStrings: variable sized records containing entity name,
#                 replacement and optionally the position of a
#                 subtable
# Load the named character reference table published at
# html.spec.whatwg.org. The file is looked up in the current working
# directory only; the script exits with status 1 when it is missing.
try:
    # Explicit encoding so that decoding does not depend on the locale's
    # default text encoding.
    with open('entities.json', encoding='utf-8') as json_data:
        ents = json.load(json_data)
except FileNotFoundError:
    # Report the failure on stderr, the conventional stream for diagnostics.
    print('entities.json not found, try curl -LJO',
          'https://html.spec.whatwg.org/entities.json',
          file=sys.stderr)
    sys.exit(1)
def to_cchars(s):
    """Convert a string to a list of C initializer elements, one per byte.

    The string is encoded with ``str.encode()`` (UTF-8). Printable ASCII
    bytes (0x20-0x7E) other than the single quote and the backslash become
    C character literals such as ``'a'``, returned as ``str``. Every other
    byte is returned as a plain ``int``, because it could not be written
    inside a character literal without escaping.

    Args:
        s: Text to convert.

    Returns:
        A list mixing ``str`` character literals and ``int`` byte values,
        in the order of the encoded bytes. Empty for an empty string.
    """
    quote, backslash = ord("'"), ord('\\')
    return [
        f"'{chr(byte)}'"
        if 0x20 <= byte <= 0x7E and byte not in (quote, backslash)
        else byte
        for byte in s.encode()
    ]
@dataclass
class PrefixStackEntry:
    """One level of the prefix stack kept while walking the sorted names."""

    # Prefix shared by every name that is stored in this level's table.
    prefix: str
    # Index into ``tables`` of the table collecting those names.
    table_id: int
@dataclass
class AlphaFixup:
    """Deferred alpha-table entry for one initial (first-character) table."""

    # Index into ``tables`` of the initial table.
    table_id: int
    # Slot in the alpha table; recorded as the first character's code
    # point modulo 64.
    char: int
@dataclass
class StringFixup:
    """Deferred patch of a subtable reference inside ``strings``."""

    # Index into ``tables`` of the subtable being referenced.
    table_id: int
    # Position in ``strings`` of the two placeholder elements to overwrite.
    string_index: int
    # Index into ``tables`` of the table that holds the referring entry.
    super_table_id: int
    # Position of the referring entry within that table.
    super_offset: int
# Remove entity strings without trailing semicolon. Whether such a variant
# exists is recorded later as a flag on the semicolon form.
keys = (key for key in ents if key.endswith(';'))

# Sort entity strings by name, i.e. with the first and the last character
# stripped. In this order a name that is a prefix of other names comes
# directly before them.
keys = sorted(keys, key=lambda k: k[1:-1])

strings = []        # elements of the string table, filled record by record
prefix_stack = []   # PrefixStackEntry for every currently open prefix
alpha_fixups = []   # AlphaFixup for every initial table
string_fixups = []  # StringFixup for every subtable reference

# Tables of indices into ``strings``. The list starts out with 64 empty
# tables; being empty, they contribute nothing to the concatenated values.
# NOTE(review): their purpose is not visible here -- confirm before removing.
tables = [[] for _ in range(64)]
# Walk the sorted names once. Every entity gets one record in ``strings``
# and one entry in the table of the longest prefix that is still open.
#
# Record layout:
#   header byte: name length (low six bits) | 0x40 subtable | 0x80 flag
#   name bytes (only the part after the already matched prefix)
#   two subtable bytes (only if 0x40 is set; patched in a later pass)
#   replacement length, replacement bytes
for i, key in enumerate(keys):
    name = key[1:-1]

    # Name of the following entity, needed to detect that the current
    # name is a prefix of its successor.
    next_name = keys[i + 1][1:-1] if i + 1 < len(keys) else None

    # Leave every subtable whose prefix no longer matches.
    while prefix_stack and not name.startswith(prefix_stack[-1].prefix):
        prefix_stack.pop()

    # First character is initial prefix
    if not prefix_stack:
        table_id = len(tables)
        tables.append([])

        prefix_stack.append(PrefixStackEntry(name[0], table_id))
        alpha_fixups.append(AlphaFixup(table_id, ord(name[0]) % 64))

    current = prefix_stack[-1]
    string_index = len(strings)
    table = tables[current.table_id]
    table_index = len(table)
    table.append(string_index)

    name_chars = to_cchars(name[len(current.prefix):])
    repl_chars = to_cchars(ents[key]['characters'])

    # The name length shares the header byte with the 0x40 and 0x80 flags,
    # and the replacement length is a single element of a byte table.
    if len(name_chars) >= 0x40:
        raise ValueError(f'entity name too long to encode: {key!r}')
    if len(repl_chars) > 0xFF:
        raise ValueError(f'replacement too long to encode: {key!r}')

    # 0x80 marks entities that entities.json also lists without the
    # trailing semicolon.
    semicolon_flag = 0x80 if key[:-1] in ents else 0

    if next_name and next_name.startswith(name):
        # Create subtable
        strings += [
            len(name_chars) | semicolon_flag | 0x40, *name_chars,
            0, 0,  # subtable position, to be fixed up
            len(repl_chars), *repl_chars,
        ]

        table_id = len(tables)
        tables.append([])

        # The placeholders follow the header byte and the name bytes.
        fixup_index = string_index + 1 + len(name_chars)
        string_fixups.append(StringFixup(
            table_id, fixup_index, current.table_id, table_index,
        ))

        prefix_stack.append(PrefixStackEntry(name, table_id))
    else:
        strings += [
            len(name_chars) | semicolon_flag, *name_chars,
            len(repl_chars), *repl_chars,
        ]
# Concat tables and record ranges: the values of table ``t`` end up in
# values[ranges[t]:ranges[t + 1]].
ranges = [0]
values = []
for table in tables:
    values.extend(table)
    ranges.append(len(values))

# The values are emitted as ``unsigned short``; anything larger would be
# truncated in the generated C table.
if values and max(values) > 0xFFFF:
    raise ValueError('string table too large for 16-bit indices')
# Create alpha table: three bytes per slot, holding the start of the
# initial table within ``values`` (low byte, high byte) and its number
# of entries. Slots without an initial table stay zero.
alpha = [0] * (59 * 3)
for fixup in alpha_fixups:
    start = ranges[fixup.table_id]
    length = ranges[fixup.table_id + 1] - start
    slot = fixup.char * 3

    # A slice assignment past the end of a list appends silently, which
    # would corrupt the table instead of failing.
    if slot + 3 > len(alpha):
        raise ValueError(f'alpha slot {fixup.char} out of range')
    # Every element is emitted as one byte.
    if start > 0xFFFF or length > 0xFF:
        raise ValueError(
            f'initial table {fixup.table_id} does not fit the alpha table')

    alpha[slot:slot + 3] = [start & 0xFF, start >> 8, length]
# Fix up subtable positions: overwrite the two placeholder elements of
# each referring record with the distance from the referring entry to the
# start of the subtable (both measured in ``values``) and with the number
# of entries in the subtable.
for fixup in string_fixups:
    start = ranges[fixup.table_id]
    count = ranges[fixup.table_id + 1] - start
    super_index = ranges[fixup.super_table_id] + fixup.super_offset
    distance = start - super_index

    # Both elements are emitted as single bytes.
    if not 0 <= distance <= 0xFF or count > 0xFF:
        raise ValueError(
            f'subtable {fixup.table_id} does not fit into byte-sized fields')

    pos = fixup.string_index
    strings[pos:pos + 2] = [distance, count]
# Print tables
def gen_table(ctype, cname, values, fmt, elems_per_line):
    """Render *values* as the C source of a ``static const`` array.

    Elements are separated by commas. A new line is started before the
    first element and before every ``elems_per_line``-th element after it.

    Args:
        ctype: C element type, e.g. ``unsigned char``.
        cname: Name of the C array.
        values: Sequence of elements to emit.
        fmt: printf-style format applied to each element.
        elems_per_line: Number of elements per output line.

    Returns:
        The array definition as a string, followed by a blank line.
    """
    parts = []
    for i, value in enumerate(values):
        if i % elems_per_line == 0:
            sep = '\n ' if i == 0 else ',\n '
        else:
            sep = ', '
        # Wrap in a 1-tuple so that a tuple element could never be taken
        # for multiple format arguments.
        parts.append(sep + fmt % (value,))

    body = ''.join(parts)
    return f'static const {ctype} {cname}[{len(values)}] = {{{body}\n}};\n\n'
# Write the three generated tables, in this order, to the include file.
# The path is relative to the current working directory.
with open('codegen/html5ent.inc', 'w', encoding='utf-8') as out:
    out.write(gen_table('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15))
    out.write(gen_table('unsigned short', 'htmlEntValues', values, '%5d', 10))
    # '%3s' because this table mixes character literals and integers.
    out.write(gen_table('unsigned char', 'htmlEntStrings', strings, '%3s', 15))