From 258d8706291cc8289598e14efc3fdc0ce0f6f575 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer <wellnhofer@aevum.de>
Date: Thu, 15 May 2025 17:49:49 +0200
Subject: [PATCH] codegen: Consolidate tools for code generation

Move tools, source files and output tables into codegen directory.

Rename some files.

Adjust tools to match modified files. Remove the generation date and
the list of source files from the generated output.

Distribute all tools and sources.
---
 HTMLparser.c                                  |  2 +-
 Makefile.am                                   | 17 +++-
 iso8859x.inc => codegen/charset.inc           |  0
 chvalid.def => codegen/chvalid.def            |  0
 codegen/escape.inc                            | 66 +++++++++++++
 {tools => codegen}/genChRanges.py             | 96 +++++++++----------
 .../genTranscode.py => codegen/genCharset.py  |  2 +-
 codegen/genEscape.py                          | 69 +++++++++++++
 tools/genHtmlEnt.py => codegen/genHtml5Ent.py |  9 +-
 {tools => codegen}/genHtml5LibTests.py        |  0
 tools/gentest.py => codegen/genTestApi.py     | 68 ++++++-------
 {tools => codegen}/genUnicode.py              | 48 ++++------
 html5ent.inc => codegen/html5ent.inc          |  0
 {tools => codegen}/xmlmod.py                  |  0
 encoding.c                                    |  2 +-
 python/generator.py                           |  2 +-
 tools/genEscape.py                            | 66 -------------
 xmlIO.c                                       | 71 +-------------
 18 files changed, 256 insertions(+), 262 deletions(-)
 rename iso8859x.inc => codegen/charset.inc (100%)
 rename chvalid.def => codegen/chvalid.def (100%)
 create mode 100644 codegen/escape.inc
 rename {tools => codegen}/genChRanges.py (94%)
 rename tools/genTranscode.py => codegen/genCharset.py (99%)
 create mode 100755 codegen/genEscape.py
 rename tools/genHtmlEnt.py => codegen/genHtml5Ent.py (93%)
 rename {tools => codegen}/genHtml5LibTests.py (100%)
 mode change 100644 => 100755
 rename tools/gentest.py => codegen/genTestApi.py (81%)
 mode change 100644 => 100755
 rename {tools => codegen}/genUnicode.py (90%)
 rename html5ent.inc => codegen/html5ent.inc (100%)
 rename {tools => codegen}/xmlmod.py (100%)
 delete mode 100755 tools/genEscape.py

diff --git a/HTMLparser.c b/HTMLparser.c
index 35e162a4..9a8e2c05 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2432,7 +2432,7 @@ htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
     return(out);
 }
 
-#include "html5ent.inc"
+#include "codegen/html5ent.inc"
 
 #define ENT_F_SEMICOLON 0x80u
 #define ENT_F_SUBTABLE  0x40u
diff --git a/Makefile.am b/Makefile.am
index ae877617..c07147ea 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -154,7 +154,7 @@ testdso_la_LDFLAGS = $(AM_LDFLAGS) \
 		     -module -no-undefined -avoid-version -rpath $(libdir)
 
 rebuild_testapi:
-	cd $(srcdir) && python3 tools/gentest.py $(abs_builddir)
+	cd $(srcdir) && python3 codegen/genTestApi.py $(abs_builddir)
 
 testapi_SOURCES=testapi.c
 testapi_DEPENDENCIES = $(DEPS)
@@ -201,9 +201,18 @@ CLEANFILES = missing.lst runsuite.log runxmlconf.log test.out \
 
 EXTRA_DIST = Copyright libxml2-config.cmake.in autogen.sh \
 	     libxml.h \
-	     html5ent.inc iso8859x.inc \
-	     tools/gentest.py tools/xmlmod.py \
-	     tools/genChRanges.py tools/genEscape.py tools/genUnicode.py \
+	     codegen/charset.inc \
+	     codegen/chvalid.def \
+	     codegen/escape.inc \
+	     codegen/genCharset.py \
+	     codegen/genChRanges.py \
+	     codegen/genEscape.py \
+	     codegen/genHtml5Ent.py \
+	     codegen/genHtml5LibTests.py \
+	     codegen/genTestApi.py \
+	     codegen/genUnicode.py \
+	     codegen/html5ent.inc \
+	     codegen/xmlmod.py \
 	     timsort.h \
 	     README.zOS README.md \
 	     CMakeLists.txt config.h.cmake.in libxml2-config.cmake.cmake.in \
diff --git a/iso8859x.inc b/codegen/charset.inc
similarity index 100%
rename from iso8859x.inc
rename to codegen/charset.inc
diff --git a/chvalid.def b/codegen/chvalid.def
similarity index 100%
rename from chvalid.def
rename to codegen/chvalid.def
diff --git a/codegen/escape.inc b/codegen/escape.inc
new file mode 100644
index 00000000..a51660af
--- /dev/null
+++ b/codegen/escape.inc
@@ -0,0 +1,66 @@
+static const char xmlEscapeContent[] = {
+      8, '&', '#', 'x', 'F', 'F', 'F', 'D', ';',   4, '&', '#',
+    '9', ';',   5, '&', '#', '1', '0', ';',   5, '&', '#', '1',
+    '3', ';',   6, '&', 'q', 'u', 'o', 't', ';',   5, '&', 'a',
+    'm', 'p', ';',   4, '&', 'l', 't', ';',   4, '&', 'g', 't',
+    ';',
+};
+
+static const signed char xmlEscapeTab[128] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0, 20,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    -1, -1, -1, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+static const signed char xmlEscapeTabQuot[128] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0, 20,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+static const signed char xmlEscapeTabAttr[128] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  9, 14,  0,  0, 20,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+#ifdef LIBXML_HTML_ENABLED
+
+static const signed char htmlEscapeTab[128] = {
+     0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+static const signed char htmlEscapeTabAttr[128] = {
+     0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+#endif /* LIBXML_HTML_ENABLED */
diff --git a/tools/genChRanges.py b/codegen/genChRanges.py
similarity index 94%
rename from tools/genChRanges.py
rename to codegen/genChRanges.py
index 567b31b4..3e58b70d 100755
--- a/tools/genChRanges.py
+++ b/codegen/genChRanges.py
@@ -16,7 +16,6 @@
 #
 
 import sys
-import time
 
 #
 # A routine to take a list of yes/no (1, 0) values and turn it
@@ -41,8 +40,6 @@ def makeRange(lst):
         pos = e + 1                     # ready to check for next range
     return ret
 
-sources = "chvalid.def"                 # input filename
-
 # minTableSize gives the minimum number of ranges which must be present
 # before a 256-byte lookup table is produced.  If there are less than this
 # number, a macro with inline comparisons is generated
@@ -54,9 +51,9 @@ Functs = {}
 state = 0
 
 try:
-    defines = open("chvalid.def", "r")
+    defines = open("codegen/chvalid.def", "r")
 except:
-    print("Missing chvalid.def, aborting ...")
+    print("Missing codegen/chvalid.def, aborting ...")
     sys.exit(1)
 
 #
@@ -202,19 +199,19 @@ except:
     print("Failed to open chvalid.c")
     sys.exit(1)
 
-date = time.asctime(time.localtime(time.time()))
+fkeys = sorted(Functs.keys())
 
 header.write(
-"""/*
- * Summary: Unicode character range checking
- * Description: this module exports interfaces for the character
+"""/**
+ * @file
+ *
+ * @brief Unicode character range checking
+ *
+ * this module exports interfaces for the character
  *               range validation APIs
  *
  * This file is automatically generated from the cvs source
  * definition files using the genChRanges.py Python script
- *
- * Generation date: %s
- * Sources: %s
  */
 
 #ifndef __XML_CHVALID_H__
@@ -227,6 +224,8 @@ header.write(
 extern "C" {
 #endif
 
+/** @cond ignore */
+
 /*
  * Define our typedefs and structures
  *
@@ -254,13 +253,27 @@ struct _xmlChRangeGroup {
     const xmlChLRange\t*longRange;
 };
 
+""");
+
+for f in fkeys:
+    if len(Functs[f][1]) > 0:
+        header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)
+    if max(Functs[f][0]) > 0:   # only check if at least one entry
+        rangeTable = makeRange(Functs[f][0])
+        numRanges = len(rangeTable)
+        if numRanges >= minTableSize:   # table is worthwhile
+            header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
+
+header.write("""
 /**
  * Range checking routine
  */
 XMLPUBFUN int
 \t\txmlCharInRange(unsigned int val, const xmlChRangeGroup *group);
 
-""" % (date, sources));
+/** @endcond */
+""");
+
 output.write(
 """/*
  * chvalid.c:\tthis module implements the character range
@@ -268,9 +281,6 @@ output.write(
  *
  * This file is automatically generated from the cvs source
  * definition files using the genChRanges.py Python script
- *
- * Generation date: %s
- * Sources: %s
  */
 
 #define IN_LIBXML
@@ -287,7 +297,7 @@ output.write(
  * allowed.
  *
  */
-""" % (date, sources));
+""");
 
 #
 # Now output the generated data.
@@ -298,8 +308,6 @@ output.write(
 # compares, otherwise we output a 256-byte table and a macro to use it.
 #
 
-fkeys = sorted(Functs.keys())
-
 for f in fkeys:
 
 # First we convert the specified single-byte values into a group of ranges.
@@ -310,15 +318,13 @@ for f in fkeys:
         rangeTable = makeRange(Functs[f][0])
         numRanges = len(rangeTable)
         if numRanges >= minTableSize:   # table is worthwhile
-            header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
             header.write("""
 /**
- * %s_ch:
- * @c: char to validate
- *
  * Automatically generated by genChRanges.py
+ *
+ * @param c  char to validate
  */
-""" % f)
+""")
             header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))
 
             # write the constant data to the code file
@@ -343,12 +349,11 @@ for f in fkeys:
 
             header.write("""
 /**
- * %s_ch:
- * @c: char to validate
- *
  * Automatically generated by genChRanges.py
+ *
+ * @param c  char to validate
  */
-""" % f)
+""")
             # okay, I'm tired of the messy lineup - let's automate it!
             pline = "#define %s_ch(c)" % f
             # 'ntab' is number of tabs needed to position to col. 33 from name end
@@ -378,12 +383,11 @@ for f in fkeys:
 
     header.write("""
 /**
- * %sQ:
- * @c: char to validate
- *
  * Automatically generated by genChRanges.py
+ *
+ * @param c  char to validate
  */
-""" % f)
+""")
     pline = "#define %sQ(c)" % f
     ntab = 4 - (len(pline)) // 8
     if ntab < 0:
@@ -403,7 +407,7 @@ for f in fkeys:
         header.write(" 0)\n\n")
     else:
         if numRanges >= minTableSize:
-            header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n"  % f)
+            header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n"  % f)
         else:           # if < minTableSize, generate inline code
             firstFlag = 1
             for rg in Functs[f][1]:
@@ -417,14 +421,10 @@ for f in fkeys:
                 else:                           # value range
                     pline += "((0x%x <= (c)) &&" % rg[0]
                     pline += " ((c) <= 0x%x))" % rg[1]
-            pline += "))\n\n"
+            pline += "))\n"
             header.write(pline)
 
 
-    if len(Functs[f][1]) > 0:
-        header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)
-
-
 #
 # Next we do the unicode ranges
 #
@@ -477,14 +477,12 @@ for f in fkeys:
 output.write(
 """
 /**
- * xmlCharInRange:
- * @val: character to be validated
- * @rptr: pointer to range to be used to validate
- *
  * Does a binary search of the range table to determine if char
  * is valid
  *
- * Returns: true if character valid, false otherwise
+ * @param val  character to be validated
+ * @param rptr  pointer to range to be used to validate
+ * @returns true if character valid, false otherwise
  */
 int
 xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
@@ -542,18 +540,16 @@ xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
 for f in fkeys:
     output.write("""
 /**
- * %s:
- * @ch:  character to validate
- *
  * This function is DEPRECATED.
-""" % f);
+""");
     if max(Functs[f][0]) > 0:
-        output.write(" * Use %s_ch or %sQ instead" % (f, f))
+        output.write(" * Use %s_ch() or %sQ() instead" % (f, f))
     else:
-        output.write(" * Use %sQ instead" % f)
+        output.write(" * Use %sQ() instead" % f)
     output.write("""
  *
- * Returns true if argument valid, false otherwise
+ * @param ch  character to validate
+ * @returns true if argument valid, false otherwise
  */
 """)
     output.write("int\n%s(unsigned int ch) {\n    return(%sQ(ch));\n}\n\n" % (f,f))
diff --git a/tools/genTranscode.py b/codegen/genCharset.py
similarity index 99%
rename from tools/genTranscode.py
rename to codegen/genCharset.py
index 7a5168ae..7f73c48f 100755
--- a/tools/genTranscode.py
+++ b/codegen/genCharset.py
@@ -73,7 +73,7 @@ def genTranscodeTable(out, name, chars):
     printHexTable(out, 2, data)
     out.write('};\n\n')
 
-out = open(f'iso8859x.inc', 'w')
+out = open('codegen/charset.inc', 'w')  # path is relative to the current directory
 
 out.write('''/*
  * Lookup tables for transcoding of 8-bit character sets.
diff --git a/codegen/genEscape.py b/codegen/genEscape.py
new file mode 100755
index 00000000..03bf383d
--- /dev/null
+++ b/codegen/genEscape.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# Writes the XML/HTML escaping tables to codegen/escape.inc (relative path).
+entities = [
+    [ '',   '&#xFFFD;' ],  # no source char; emitted first, at position 0
+    [ '\t', '&#9;' ],
+    [ '\n', '&#10;' ],
+    [ '\r', '&#13;' ],
+    [ '"',  '&quot;' ],
+    [ '&',  '&amp;' ],
+    [ '<',  '&lt;' ],
+    [ '>',  '&gt;' ],
+]
+
+offset = [ None ] * 128  # char code -> position of its entry; set by gen_content()
+
+def gen_content(out):  # writes xmlEscapeContent to out and fills in offset[]
+    pos = 0  # index of the next array element to be emitted
+    r = ''
+
+    for rec in entities:
+        char, repl = rec
+
+        if char:
+            offset[ord(char)] = pos  # points at the length element emitted next
+
+        if pos % 12 == 0: r += '\n    '  # start a new row every 12 elements
+        else: r += ' '
+        r += '%3d,' % len(repl)  # the replacement's length comes first
+        pos += 1
+
+        for c in repl:  # then the replacement itself, one char literal each
+            if pos % 12 == 0: r += '\n    '
+            else: r += ' '
+            r += "'%s'," % c
+            pos += 1
+
+    out.write('static const char xmlEscapeContent[] = {%s\n};\n\n' % r)
+
+def gen_tab(out, name, escape, is_xml):  # writes the 128-entry table `name` to out
+    r = ''
+
+    for i in range(0x80):
+
+        if chr(i) in escape:  # chars listed in `escape` get their own entry
+            v = offset[i]  # still None unless gen_content() ran first
+        elif i == 0:
+            v = 0  # position 0 holds the '&#xFFFD;' entry
+        elif is_xml and i < 32 and i != 9 and i != 10:  # controls except tab, newline
+            v = 0
+        else:
+            v = -1  # NOTE(review): presumably "no escaping" -- confirm in xmlIO.c
+
+        if i % 16 == 0: r += '\n    '  # 16 values per row
+        else: r += ' '
+        r += '%2d,' % v
+
+    out.write('static const signed char %s[128] = {%s\n};\n\n' % (name, r))
+
+with open('codegen/escape.inc', 'w') as out:  # run from the directory holding codegen/
+    gen_content(out)  # must come before gen_tab(): it fills in offset[]
+
+    gen_tab(out, 'xmlEscapeTab', '\r&<>', True)
+    gen_tab(out, 'xmlEscapeTabQuot', '\r"&<>', True)
+    gen_tab(out, 'xmlEscapeTabAttr', '\t\n\r"&<>', True)
+
+    out.write('#ifdef LIBXML_HTML_ENABLED\n\n')  # HTML tables are compiled conditionally
+    gen_tab(out, 'htmlEscapeTab', '&<>', False)
+    gen_tab(out, 'htmlEscapeTabAttr', '"&', False)
+    out.write('#endif /* LIBXML_HTML_ENABLED */\n')
diff --git a/tools/genHtmlEnt.py b/codegen/genHtml5Ent.py
similarity index 93%
rename from tools/genHtmlEnt.py
rename to codegen/genHtml5Ent.py
index f87a570f..e3be9f1d 100755
--- a/tools/genHtmlEnt.py
+++ b/codegen/genHtml5Ent.py
@@ -162,8 +162,9 @@ def gen_table(ctype, cname, values, fmt, elems_per_line):
         else: r += ' '
         r += fmt % values[i]
 
-    return f'static const {ctype} {cname}[{count}] = {{{r}\n}};\n'
+    return f'static const {ctype} {cname}[{count}] = {{{r}\n}};\n\n'
 
-print(gen_table('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15))
-print(gen_table('unsigned short', 'htmlEntValues', values, '%5d', 10))
-print(gen_table('unsigned char', 'htmlEntStrings', strings, '%3s', 15))
+with open('codegen/html5ent.inc', 'w') as out:
+    out.write(gen_table('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15))
+    out.write(gen_table('unsigned short', 'htmlEntValues', values, '%5d', 10))
+    out.write(gen_table('unsigned char', 'htmlEntStrings', strings, '%3s', 15))
diff --git a/tools/genHtml5LibTests.py b/codegen/genHtml5LibTests.py
old mode 100644
new mode 100755
similarity index 100%
rename from tools/genHtml5LibTests.py
rename to codegen/genHtml5LibTests.py
diff --git a/tools/gentest.py b/codegen/genTestApi.py
old mode 100644
new mode 100755
similarity index 81%
rename from tools/gentest.py
rename to codegen/genTestApi.py
index 8efa03a6..ba9f688c
--- a/tools/gentest.py
+++ b/codegen/genTestApi.py
@@ -13,40 +13,40 @@ import xmlmod
 # Globals
 
 dtors = {
-    'htmlDocPtr': 'xmlFreeDoc',
-    'htmlParserCtxtPtr': 'htmlFreeParserCtxt',
-    'xmlAutomataPtr': 'xmlFreeAutomata',
-    'xmlBufferPtr': 'xmlBufferFree',
-    'xmlCatalogPtr': 'xmlFreeCatalog',
+    'htmlDoc *': 'xmlFreeDoc',
+    'htmlParserCtxt *': 'htmlFreeParserCtxt',
+    'xmlAutomata *': 'xmlFreeAutomata',
+    'xmlBuffer *': 'xmlBufferFree',
+    'xmlCatalog *': 'xmlFreeCatalog',
     'xmlChar *': 'xmlFree',
-    'xmlDOMWrapCtxtPtr': 'xmlDOMWrapFreeCtxt',
-    'xmlDictPtr': 'xmlDictFree',
-    'xmlDocPtr': 'xmlFreeDoc',
-    'xmlDtdPtr': 'xmlFreeDtd',
-    'xmlEntitiesTablePtr': 'xmlFreeEntitiesTable',
-    'xmlEnumerationPtr': 'xmlFreeEnumeration',
-    'xmlListPtr': 'xmlListDelete',
-    'xmlModulePtr': 'xmlModuleFree',
-    'xmlMutexPtr': 'xmlFreeMutex',
-    'xmlNodePtr': 'xmlFreeNode',
-    'xmlNodeSetPtr': 'xmlXPathFreeNodeSet',
-    'xmlNsPtr': 'xmlFreeNs',
-    'xmlOutputBufferPtr': 'xmlOutputBufferClose',
-    'xmlParserCtxtPtr': 'xmlFreeParserCtxt',
-    'xmlParserInputBufferPtr': 'xmlFreeParserInputBuffer',
-    'xmlParserInputPtr': 'xmlFreeInputStream',
-    'xmlRMutexPtr': 'xmlFreeRMutex',
-    'xmlRelaxNGValidCtxtPtr': 'xmlRelaxNGFreeValidCtxt',
-    'xmlSaveCtxtPtr': 'xmlSaveClose',
-    'xmlSchemaFacetPtr': 'xmlSchemaFreeFacet',
-    'xmlSchemaValPtr': 'xmlSchemaFreeValue',
-    'xmlSchemaValidCtxtPtr': 'xmlSchemaFreeValidCtxt',
-    'xmlTextWriterPtr': 'xmlFreeTextWriter',
-    'xmlURIPtr': 'xmlFreeURI',
-    'xmlValidCtxtPtr': 'xmlFreeValidCtxt',
-    'xmlXPathContextPtr': 'xmlXPathFreeContext',
-    'xmlXPathParserContextPtr': 'xmlXPathFreeParserContext',
-    'xmlXPathObjectPtr': 'xmlXPathFreeObject',
+    'xmlDOMWrapCtxt *': 'xmlDOMWrapFreeCtxt',
+    'xmlDict *': 'xmlDictFree',
+    'xmlDoc *': 'xmlFreeDoc',
+    'xmlDtd *': 'xmlFreeDtd',
+    'xmlEntitiesTable *': 'xmlFreeEntitiesTable',
+    'xmlEnumeration *': 'xmlFreeEnumeration',
+    'xmlList *': 'xmlListDelete',
+    'xmlModule *': 'xmlModuleFree',
+    'xmlMutex *': 'xmlFreeMutex',
+    'xmlNode *': 'xmlFreeNode',
+    'xmlNodeSet *': 'xmlXPathFreeNodeSet',
+    'xmlNs *': 'xmlFreeNs',
+    'xmlOutputBuffer *': 'xmlOutputBufferClose',
+    'xmlParserCtxt *': 'xmlFreeParserCtxt',
+    'xmlParserInputBuffer *': 'xmlFreeParserInputBuffer',
+    'xmlParserInput *': 'xmlFreeInputStream',
+    'xmlRMutex *': 'xmlFreeRMutex',
+    'xmlRelaxNGValidCtxt *': 'xmlRelaxNGFreeValidCtxt',
+    'xmlSaveCtxt *': 'xmlSaveClose',
+    'xmlSchemaFacet *': 'xmlSchemaFreeFacet',
+    'xmlSchemaVal *': 'xmlSchemaFreeValue',
+    'xmlSchemaValidCtxt *': 'xmlSchemaFreeValidCtxt',
+    'xmlTextWriter *': 'xmlFreeTextWriter',
+    'xmlURI *': 'xmlFreeURI',
+    'xmlValidCtxt *': 'xmlFreeValidCtxt',
+    'xmlXPathContext *': 'xmlXPathFreeContext',
+    'xmlXPathParserContext *': 'xmlXPathFreeParserContext',
+    'xmlXPathObject *': 'xmlXPathFreeObject',
 }
 
 blockList = {
@@ -194,7 +194,7 @@ for file in os.listdir(xmlDocDir):
             dtor = dtors.get(rtype)
             if dtor is not None:
                 code = f'{dtor}({code})'
-            elif rtype == 'xmlHashTablePtr':
+            elif rtype == 'xmlHashTable *':
                 code = f'xmlHashFree({code}, NULL)'
 
             mmfunc[name] = f'    {code};\n'
diff --git a/tools/genUnicode.py b/codegen/genUnicode.py
similarity index 90%
rename from tools/genUnicode.py
rename to codegen/genUnicode.py
index 67fef622..de881f43 100755
--- a/tools/genUnicode.py
+++ b/codegen/genUnicode.py
@@ -11,10 +11,8 @@
 #
 import sys
 import string
-import time
 
 webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
-sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
 
 #
 # blockAliases is a small hack - it is used for mapping block names which
@@ -31,7 +29,8 @@ blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
 # number, inline comparisons are generated
 minTableSize = 8
 
-(blockfile, catfile) = sources.split()
+blockfile = "Blocks-4.0.1.txt"
+catfile = "UnicodeData-4.0.1.txt"
 
 
 #
@@ -197,8 +196,6 @@ except:
     print("Failed to open xmlunicode.c")
     sys.exit(1)
 
-date = time.asctime(time.localtime(time.time()))
-
 output.write(
 """/*
  * xmlunicode.c: this module implements the Unicode character APIs
@@ -207,9 +204,6 @@ output.write(
  * UCS description files of the Unicode Character Database
  * %s
  * using the genUnicode.py Python script.
- *
- * Generation date: %s
- * Sources: %s
  */
 
 #define IN_LIBXML
@@ -238,7 +232,7 @@ typedef struct {
 
 static xmlIntFunc *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname);
 
-""" % (webpage, date, sources));
+""" % webpage);
 
 #
 # For any categories with more than minTableSize ranges we generate
@@ -281,13 +275,11 @@ for name in ckeys:
 
 output.write(
 """/**
- * xmlUnicodeLookup:
- * @tptr: pointer to the name table
- * @tname: name to be found
- *
  * binary table lookup for user-supplied name
  *
- * Returns pointer to range function if found, otherwise NULL
+ * @param tptr  pointer to the name table
+ * @param tname  name to be found
+ * @returns pointer to range function if found, otherwise NULL
  */
 static xmlIntFunc
 *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname) {
@@ -316,10 +308,10 @@ static xmlIntFunc
 
 for block in bkeys:
     name = block.replace('-', '')
-    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
-    output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
+    output.write("/**\n * Check whether the character is part of %s UCS Block\n"%
                  (block))
-    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
+    output.write(" *\n * @param code  UCS code point\n")
+    output.write(" * @returns 1 if true 0 otherwise\n */\n");
     output.write("static int\nxmlUCSIs%s(int code) {\n    return(" % name)
     flag = 0
     for (start, end) in BlockNames[block]:
@@ -332,10 +324,10 @@ for block in bkeys:
 
 for name in ckeys:
     ranges = Categories[name]
-    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
-    output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
+    output.write("/**\n * Check whether the character is part of %s UCS Category\n"%
                  (name))
-    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
+    output.write(" *\n * @param code  UCS code point\n")
+    output.write(" * @returns 1 if true 0 otherwise\n */\n");
     output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
     if len(Categories[name]) > minTableSize:
         output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
@@ -385,13 +377,11 @@ static const xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
 static const xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
 
 /**
- * xmlUCSIsBlock:
- * @code: UCS code point
- * @block: UCS block name
- *
  * Check whether the character is part of the UCS Block
  *
- * Returns 1 if true, 0 if false and -1 on unknown block
+ * @param code  UCS code point
+ * @param block  UCS block name
+ * @returns 1 if true, 0 if false and -1 on unknown block
  */
 int
 xmlUCSIsBlock(int code, const char *block) {
@@ -404,13 +394,11 @@ xmlUCSIsBlock(int code, const char *block) {
 }
 
 /**
- * xmlUCSIsCat:
- * @code: UCS code point
- * @cat: UCS Category name
- *
  * Check whether the character is part of the UCS Category
  *
- * Returns 1 if true, 0 if false and -1 on unknown category
+ * @param code  UCS code point
+ * @param cat  UCS Category name
+ * @returns 1 if true, 0 if false and -1 on unknown category
  */
 int
 xmlUCSIsCat(int code, const char *cat) {
diff --git a/html5ent.inc b/codegen/html5ent.inc
similarity index 100%
rename from html5ent.inc
rename to codegen/html5ent.inc
diff --git a/tools/xmlmod.py b/codegen/xmlmod.py
similarity index 100%
rename from tools/xmlmod.py
rename to codegen/xmlmod.py
diff --git a/encoding.c b/encoding.c
index 659f93a3..4e4b6359 100644
--- a/encoding.c
+++ b/encoding.c
@@ -279,7 +279,7 @@ UTF8ToHtmlWrapper(void *vctxt, unsigned char *out, int *outlen,
 #define UTF8ToHtmlWrapper NULL
 #endif
 
-#include "iso8859x.inc"
+#include "codegen/charset.inc"
 
 static xmlCharEncError
 EightBitToUtf8(void *vctxt, unsigned char* out, int *outlen,
diff --git a/python/generator.py b/python/generator.py
index 63c48446..112367b6 100755
--- a/python/generator.py
+++ b/python/generator.py
@@ -350,7 +350,7 @@ skipped_types = {
 import os
 import xml.etree.ElementTree as etree
 
-sys.path.append(srcPref + '/../tools')
+sys.path.append(srcPref + '/../codegen')
 import xmlmod
 
 xmlDocDir = dstPref + '/../doc/xml'
diff --git a/tools/genEscape.py b/tools/genEscape.py
deleted file mode 100755
index fbd12c90..00000000
--- a/tools/genEscape.py
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env python3
-
-entities = [
-    [ '',   '&#xFFFD;' ],
-    [ '\t', '&#9;' ],
-    [ '\n', '&#10;' ],
-    [ '\r', '&#13;' ],
-    [ '"',  '&quot;' ],
-    [ '&',  '&amp;' ],
-    [ '<',  '&lt;' ],
-    [ '>',  '&gt;' ],
-]
-
-### xmlEscapeContent
-
-offset = [ None ] * 128
-pos = 0
-r = ''
-
-for rec in entities:
-    char, repl = rec
-
-    if char:
-        offset[ord(char)] = pos
-
-    if pos % 12 == 0: r += '\n    '
-    else: r += ' '
-    r += '%3d,' % len(repl)
-    pos += 1
-
-    for c in repl:
-        if pos % 12 == 0: r += '\n    '
-        else: r += ' '
-        r += "'%s'," % c
-        pos += 1
-
-print('static const char xmlEscapeContent[] = {%s\n};\n' % r)
-
-def gen_tab(name, escape, is_xml):
-    r = ''
-
-    for i in range(0x80):
-
-        if chr(i) in escape:
-            v = offset[i]
-        elif i == 0:
-            v = 0
-        elif is_xml and i < 32 and i != 9 and i != 10:
-            v = 0
-        else:
-            v = -1
-
-        if i % 16 == 0: r += '\n    '
-        else: r += ' '
-        r += '%2d,' % v
-
-    print('static const signed char %s[128] = {%s\n};\n' % (name, r))
-
-gen_tab('xmlEscapeTab', '\r&<>', True)
-gen_tab('xmlEscapeTabQuot', '\r"&<>', True)
-gen_tab('xmlEscapeTabAttr', '\t\n\r"&<>', True)
-
-print('#ifdef LIBXML_HTML_ENABLED\n')
-gen_tab('htmlEscapeTab', '&<>', False)
-gen_tab('htmlEscapeTabAttr', '"&', False)
-print('#endif /* LIBXML_HTML_ENABLED */')
diff --git a/xmlIO.c b/xmlIO.c
index 6c9449b2..7f548020 100644
--- a/xmlIO.c
+++ b/xmlIO.c
@@ -159,76 +159,7 @@ xmlSerializeHexCharRef(char *buf, int val) {
     return(out - buf);
 }
 
-/*
- * Tables generated with tools/genEscape.py
- */
-
-static const char xmlEscapeContent[] = {
-      8, '&', '#', 'x', 'F', 'F', 'F', 'D', ';',   4, '&', '#',
-    '9', ';',   5, '&', '#', '1', '0', ';',   5, '&', '#', '1',
-    '3', ';',   6, '&', 'q', 'u', 'o', 't', ';',   5, '&', 'a',
-    'm', 'p', ';',   4, '&', 'l', 't', ';',   4, '&', 'g', 't',
-    ';',
-};
-
-static const signed char xmlEscapeTab[128] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0, 20,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-    -1, -1, -1, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-static const signed char xmlEscapeTabQuot[128] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0, 20,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-static const signed char xmlEscapeTabAttr[128] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  9, 14,  0,  0, 20,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-#ifdef LIBXML_HTML_ENABLED
-
-static const signed char htmlEscapeTab[128] = {
-     0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-static const signed char htmlEscapeTabAttr[128] = {
-     0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-#endif /* LIBXML_HTML_ENABLED */
+#include "codegen/escape.inc"
 
 /*
  * @param text  input text