html: Parse named character references according to HTML5

2025-10-24 13:33:01 +03:00 · 2024-09-03 15:52:44 +02:00
parent d5cd0f07f8
commit 5951179239
25 changed files with 23265 additions and 432 deletions
--- a/tools/genHtmlEnt.py
+++ b/tools/genHtmlEnt.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+
+import json
+import sys
+from dataclasses import dataclass
+
+# The basic idea is to find named character references using binary
+# search. Since entity strings may not have a terminator, this doesn't
+# work if one entity string is a prefix of another. In this case,
+# we branch to a subtable after matching the prefix.
+#
+# We create separate initial tables based on the first character
+# of the entity name.
+#
+# The following tables are generated:
+#
+# htmlEntAlpha:   start (16-bit, low byte first) and size of the
+#                 initial tables, indexing into htmlEntValues
+# htmlEntValues:  concatenation of all table values, which index into
+#                 htmlEntStrings
+# htmlEntStrings: variable sized records: flags/length byte, entity
+#                 name, optional subtable position and size, then
+#                 the length-prefixed replacement
+
+try:  # JSON is UTF-8: decode explicitly, not via the locale default
+    with open('entities.json', encoding='utf-8') as json_data:
+        ents = json.load(json_data)
+except FileNotFoundError:  # report on stderr; stdout is the C output
+    print('entities.json not found, try curl -LJO',
+          'https://html.spec.whatwg.org/entities.json', file=sys.stderr)
+    sys.exit(1)
+
+def to_cchars(s):
+    # Render the UTF-8 bytes of s as C array elements: a quoted char
+    # literal for printable ASCII, the plain integer value otherwise.
+    def as_element(byte):
+        # Quote and backslash would need escaping inside '...', so
+        # they are emitted numerically like non-printable bytes.
+        if 0x20 <= byte <= 0x7E and chr(byte) not in "'\\":
+            return f"'{chr(byte)}'"
+        return byte
+
+    return [as_element(byte) for byte in s.encode()]
+
+@dataclass
+class PrefixStackEntry:  # one level of the currently open name prefixes
+    prefix: str    # leading part of the name that is already matched
+    table_id: int  # index in `tables` collecting names with this prefix
+
+@dataclass
+class AlphaFixup:  # deferred entry of the alpha table for one initial table
+    table_id: int  # index in `tables` of the first-character table
+    char: int      # ord(first character) % 64, the slot number in `alpha`
+
+@dataclass
+class StringFixup:  # deferred subtable reference inside `strings`
+    table_id: int        # index in `tables` of the subtable
+    string_index: int    # offset in `strings` of the 2 placeholder bytes
+    super_table_id: int  # table holding the record that owns the subtable
+    super_offset: int    # position of that record within its table
+
+# Keep only the entity strings that end with a semicolon, ordered by
+# the name between the first and the last character of the key.
+keys = sorted(
+    (entity for entity in ents if entity.endswith(';')),
+    key=lambda entity: entity[1:-1],
+)
+
+# Table ids 0..63 are empty placeholder tables; tables created while
+# walking the names are appended after them.
+tables = [[] for _ in range(64)]
+strings, prefix_stack = [], []
+alpha_fixups, string_fixups = [], []
+
+
+for i, key in enumerate(keys):
+    name = key[1:-1]  # key without its first and last character (';')
+    # Look ahead: the following name decides whether a subtable is needed
+    next_name = None
+    if i + 1 < len(keys):
+        next_name = keys[i+1][1:-1]
+    # Leave every stacked prefix that this name does not start with
+    while prefix_stack and not name.startswith(prefix_stack[-1].prefix):
+        prefix_stack.pop()
+
+    # Stack empty: open the initial table for this first character
+    if not prefix_stack:
+        table_id = len(tables)
+        tables.append([])
+
+        prefix_stack.append(PrefixStackEntry(name[0], table_id))
+        alpha_fixups.append(AlphaFixup(table_id, ord(name[0]) % 64))
+    # The innermost table gets the offset of the record added below
+    string_index = len(strings)
+    table = tables[prefix_stack[-1].table_id]
+    table_index = len(table)
+    table.append(string_index)
+    # Only the part of the name after the matched prefix is stored
+    name_offset = len(prefix_stack[-1].prefix)
+    name_chars = to_cchars(name[name_offset:])
+    repl_chars = to_cchars(ents[key]['characters'])
+    semicolon_flag = 0
+    if key[:-1] in ents:
+        semicolon_flag = 0x80  # the key without ';' is an entity too
+
+    if next_name and next_name.startswith(name):
+        # Next name extends this one: set 0x40 and open a subtable
+
+        strings += [
+            len(name_chars) | semicolon_flag | 0x40, *name_chars,
+            0, 0, # subtable offset and size, patched after layout
+            len(repl_chars), *repl_chars,
+        ]
+
+        table_id = len(tables)
+        tables.append([])
+
+        fixup_index = string_index + 1 + len(name_chars)  # skip head + name
+        string_fixups.append(StringFixup(
+            table_id, fixup_index, prefix_stack[-1].table_id, table_index,
+        ))
+
+        prefix_stack.append(PrefixStackEntry(name, table_id))
+    else:
+        strings += [
+            len(name_chars) | semicolon_flag, *name_chars,
+            len(repl_chars), *repl_chars,
+        ]
+
+# Flatten the tables; table t owns values[ranges[t]:ranges[t+1]]
+values = []
+ranges = [ 0 ]
+for entries in tables:
+    values.extend(entries)
+    ranges.append(len(values))
+
+# Alpha table: one 3-byte slot per first character, holding the start
+# of its initial table (low byte, high byte) and the table's size.
+alpha = [ 0 ] * (59 * 3)
+for fixup in alpha_fixups:
+    first, past = ranges[fixup.table_id], ranges[fixup.table_id + 1]
+    slot = 3 * fixup.char
+    alpha[slot:slot + 3] = [ first & 0xFF, first >> 8, past - first ]
+
+# Patch each subtable placeholder with the subtable's start, relative
+# to the owning record's position in values, and the subtable's size.
+for fixup in string_fixups:
+    first, past = ranges[fixup.table_id], ranges[fixup.table_id + 1]
+    owner = ranges[fixup.super_table_id] + fixup.super_offset
+    at = fixup.string_index
+    strings[at:at + 2] = [ first - owner, past - first ]
+
+# Print tables
+
+def gen_table(ctype, cname, values, fmt, elems_per_line):
+    # Return the C source of a static const array named cname with
+    # element type ctype. Each value is rendered with fmt % value;
+    # a new indented row starts every elems_per_line values.
+    cells = (
+        ('\n    ' if pos % elems_per_line == 0 else ' ') + fmt % value
+        for pos, value in enumerate(values)
+    )
+    body = ','.join(cells)
+
+    return f'static const {ctype} {cname}[{len(values)}] = {{{body}\n}};\n'
+# Emit the three tables to stdout, each followed by a blank line
+print(gen_table('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15))
+print(gen_table('unsigned short', 'htmlEntValues', values, '%5d', 10))
+print(gen_table('unsigned char', 'htmlEntStrings', strings, '%3s', 15))