mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-21 14:53:44 +03:00
Move tools, source files and output tables into codegen directory. Rename some files. Adjust tools to match modified files. Remove generation date and source files from output. Distribute all tools and sources.
171 lines
4.6 KiB
Python
Executable File
171 lines
4.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import json
|
|
import sys
|
|
from dataclasses import dataclass
|
|
|
|
# The basic idea is to find named character references using binary
|
|
# search. Since entity strings may not have a terminator, this doesn't
|
|
# work if one entity string is a prefix of another. In this case,
|
|
# we branch to a subtable after matching the prefix.
|
|
#
|
|
# We create separate initial tables based on the first character
|
|
# of the entity name.
|
|
#
|
|
# The following tables are generated:
|
|
#
|
|
# htmlEntAlpha: start and end of initial tables, indexing into
|
|
# htmlEntValues
|
|
# htmlEntValues: concatenation of all table values, which index into
|
|
# htmlEntStrings
|
|
# htmlEntStrings: variable sized records containing entity name,
|
|
# replacement and optionally the position of a
|
|
# subtable
|
|
|
|
# Load the named character reference table into `ents`. Each key is an
# entity string and each value is a dict with a 'characters' entry (the
# replacement text).
#
# The file is decoded explicitly as UTF-8 so the result does not depend
# on the locale's default encoding, and the hint for a missing file goes
# to stderr because it is a diagnostic, not program output.
try:
    with open('entities.json', encoding='utf-8') as json_data:
        ents = json.load(json_data)
except FileNotFoundError:
    print('entities.json not found, try curl -LJO',
          'https://html.spec.whatwg.org/entities.json',
          file=sys.stderr)
    sys.exit(1)
|
|
|
|
def to_cchars(s):
    """Return the encoded bytes of *s* as a list of C initializer items.

    The string is encoded with ``str.encode()`` (UTF-8 by default). Each
    byte in the printable ASCII range 0x20..0x7E, except the single quote
    and the backslash, becomes a quoted character literal string such as
    ``"'a'"``. Every other byte is kept as its integer value.
    """
    # These two characters would need escaping inside a C character
    # literal, so they are emitted numerically instead.
    needs_escape = (ord("'"), ord('\\'))

    def as_c_item(byte):
        is_printable = 0x20 <= byte <= 0x7E
        if is_printable and byte not in needs_escape:
            return f"'{chr(byte)}'"
        return byte

    return [as_c_item(byte) for byte in s.encode()]
|
|
|
|
@dataclass
class PrefixStackEntry:
    """One level of the prefix stack used while walking the sorted names."""

    # Part of the entity name that is already matched at this level. The
    # bottom entry holds only the first character of the name.
    prefix: str
    # Index into `tables` of the table that collects the names sharing
    # this prefix.
    table_id: int
|
|
|
|
@dataclass
class AlphaFixup:
    """Deferred write into the alpha table for one initial table."""

    # Index into `tables` of the initial table for a first character.
    table_id: int
    # Slot in the alpha table: the first character's code point modulo 64.
    char: int
|
|
|
|
@dataclass
class StringFixup:
    """Deferred write of a subtable position into the string records."""

    # Index into `tables` of the subtable being referenced.
    table_id: int
    # Position in `strings` of the two placeholder items to overwrite.
    string_index: int
    # Index into `tables` of the table holding the referencing entry.
    super_table_id: int
    # Index of the referencing entry within that table.
    super_offset: int
|
|
|
|
# Keep only the entity strings that end with a semicolon and order them
# by bare name, i.e. without the first and the last character.
keys = [key for key in ents if key.endswith(';')]
keys.sort(key=lambda k: k[1:-1])

strings = []
# NOTE(review): 64 empty tables are created before any real table, so
# real table ids start at 64. Being empty they add nothing to the
# concatenated values - presumably a reserved id range; confirm.
tables = [[] for _ in range(64)]
prefix_stack = []
alpha_fixups = []
string_fixups = []

names = [key[1:-1] for key in keys]

for i, (key, name) in enumerate(zip(keys, names)):
    next_name = names[i + 1] if i + 1 < len(names) else None

    # Leave every prefix level that the current name does not extend.
    while prefix_stack and not name.startswith(prefix_stack[-1].prefix):
        prefix_stack.pop()

    if not prefix_stack:
        # First character is the initial prefix: open a new initial
        # table and remember to point the alpha table at it.
        first_char = name[0]
        table_id = len(tables)
        tables.append([])
        prefix_stack.append(PrefixStackEntry(first_char, table_id))
        alpha_fixups.append(AlphaFixup(table_id, ord(first_char) % 64))

    top = prefix_stack[-1]
    table = tables[top.table_id]

    # The table entry is the position where this record starts.
    string_index = len(strings)
    table_index = len(table)
    table.append(string_index)

    # Only the part of the name after the current prefix is stored.
    name_chars = to_cchars(name[len(top.prefix):])
    repl_chars = to_cchars(ents[key]['characters'])

    # 0x80 is set when the same entity also exists without the
    # trailing semicolon.
    semicolon_flag = 0x80 if key[:-1] in ents else 0
    head = len(name_chars) | semicolon_flag

    if not (next_name and next_name.startswith(name)):
        # Plain record: head, name, replacement length, replacement.
        strings.extend([
            head, *name_chars,
            len(repl_chars), *repl_chars,
        ])
        continue

    # The next name extends this one: mark the record with 0x40 and
    # create a subtable for the names that share this prefix.
    strings.extend([
        head | 0x40, *name_chars,
        0, 0,  # subtable position, to be fixed up
        len(repl_chars), *repl_chars,
    ])

    table_id = len(tables)
    tables.append([])

    string_fixups.append(StringFixup(
        table_id=table_id,
        # The placeholders directly follow the head item and the name.
        string_index=string_index + 1 + len(name_chars),
        super_table_id=top.table_id,
        super_offset=table_index,
    ))

    prefix_stack.append(PrefixStackEntry(name, table_id))
|
|
|
|
# Concatenate all tables into one list of values. Table k occupies
# values[ranges[k]:ranges[k + 1]].
ranges = [0]
values = []
for table in tables:
    values.extend(table)
    ranges.append(len(values))

# Create the alpha table: three items per slot - low byte of the start
# position, the start position shifted right by 8, and the entry count
# of the initial table.
alpha = [0] * (59 * 3)
for fixup in alpha_fixups:
    start = ranges[fixup.table_id]
    end = ranges[fixup.table_id + 1]
    slot = fixup.char * 3
    alpha[slot:slot + 3] = [start & 0xFF, start >> 8, end - start]

# Fix up subtable positions: replace the two placeholder items of each
# record that has a subtable with the distance from the referencing
# entry to the subtable start, followed by the subtable's entry count.
for fixup in string_fixups:
    start = ranges[fixup.table_id]
    end = ranges[fixup.table_id + 1]
    super_index = ranges[fixup.super_table_id] + fixup.super_offset
    pos = fixup.string_index
    strings[pos:pos + 2] = [start - super_index, end - start]
|
|
|
|
# Print tables
|
|
|
|
def gen_table(ctype, cname, values, fmt, elems_per_line):
    """Return C source text defining a static const array.

    ctype and cname are the element type and the array name. values is
    the sequence of elements, each rendered with the %-format fmt. A
    line break is inserted before every elems_per_line-th element,
    counting from the first. Elements are separated by commas, and the
    definition ends with a blank line.
    """
    cells = []
    for pos, value in enumerate(values):
        # The first element of each row starts a new line; the others
        # follow a single space.
        lead = '\n ' if pos % elems_per_line == 0 else ' '
        cells.append(lead + fmt % value)

    body = ','.join(cells)
    return f'static const {ctype} {cname}[{len(values)}] = {{{body}\n}};\n\n'
|
|
|
|
# Write the three generated tables, in this order, to the include file.
with open('codegen/html5ent.inc', 'w') as out:
    for ctype, cname, data, fmt, per_line in (
        ('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15),
        ('unsigned short', 'htmlEntValues', values, '%5d', 10),
        ('unsigned char', 'htmlEntStrings', strings, '%3s', 15),
    ):
        out.write(gen_table(ctype, cname, data, fmt, per_line))
|