codegen: Consolidate tools for code generation

Move the tools, their source files and the generated output tables into the codegen directory. Rename some files. Adjust the tools to match the modified files. Remove the generation date and the list of source files from the generated output. Distribute all tools and sources.
2025-07-30 22:43:14 +03:00 · 2025-05-15 17:49:49 +02:00
parent 0d34d690c4
commit 258d870629
18 changed files with 256 additions and 262 deletions
--- a/HTMLparser.c
+++ b/HTMLparser.c
@ -2432,7 +2432,7 @@ htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
    return(out);
 }

-#include "html5ent.inc"
+#include "codegen/html5ent.inc"

 #define ENT_F_SEMICOLON 0x80u
 #define ENT_F_SUBTABLE  0x40u
--- a/Makefile.am
+++ b/Makefile.am
@ -154,7 +154,7 @@ testdso_la_LDFLAGS = $(AM_LDFLAGS) \
 		     -module -no-undefined -avoid-version -rpath $(libdir)

 rebuild_testapi:
-	cd $(srcdir) && python3 tools/gentest.py $(abs_builddir)
+	cd $(srcdir) && python3 codegen/genTestApi.py $(abs_builddir)

 testapi_SOURCES=testapi.c
 testapi_DEPENDENCIES = $(DEPS)
@ -201,9 +201,18 @@ CLEANFILES = missing.lst runsuite.log runxmlconf.log test.out \

 EXTRA_DIST = Copyright libxml2-config.cmake.in autogen.sh \
 	     libxml.h \
-	     html5ent.inc iso8859x.inc \
-	     tools/gentest.py tools/xmlmod.py \
-	     tools/genChRanges.py tools/genEscape.py tools/genUnicode.py \
+	     codegen/charset.inc \
+	     codegen/chvalid.def \
+	     codegen/escape.inc \
+	     codegen/genCharset.py \
+	     codegen/genChRanges.py \
+	     codegen/genEscape.py \
+	     codegen/genHtml5Ent.py \
+	     codegen/genHtml5LibTests.py \
+	     codegen/genTestApi.py \
+	     codegen/genUnicode.py \
+	     codegen/html5ent.inc \
+	     codegen/xmlmod.py \
 	     timsort.h \
 	     README.zOS README.md \
 	     CMakeLists.txt config.h.cmake.in libxml2-config.cmake.cmake.in \
--- a/codegen/charset.inc
+++ b/codegen/charset.inc
--- a/codegen/chvalid.def
+++ b/codegen/chvalid.def
--- a/codegen/escape.inc
+++ b/codegen/escape.inc
@ -0,0 +1,66 @@
+static const char xmlEscapeContent[] = {
+      8, '&', '#', 'x', 'F', 'F', 'F', 'D', ';',   4, '&', '#',
+    '9', ';',   5, '&', '#', '1', '0', ';',   5, '&', '#', '1',
+    '3', ';',   6, '&', 'q', 'u', 'o', 't', ';',   5, '&', 'a',
+    'm', 'p', ';',   4, '&', 'l', 't', ';',   4, '&', 'g', 't',
+    ';',
+};
+
+static const signed char xmlEscapeTab[128] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0, 20,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    -1, -1, -1, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+static const signed char xmlEscapeTabQuot[128] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0, 20,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+static const signed char xmlEscapeTabAttr[128] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  9, 14,  0,  0, 20,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+#ifdef LIBXML_HTML_ENABLED
+
+static const signed char htmlEscapeTab[128] = {
+     0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+static const signed char htmlEscapeTabAttr[128] = {
+     0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+#endif /* LIBXML_HTML_ENABLED */
--- a/codegen/genChRanges.py
+++ b/codegen/genChRanges.py
@ -16,7 +16,6 @@
 #

 import sys
-import time

 #
 # A routine to take a list of yes/no (1, 0) values and turn it
@ -41,8 +40,6 @@ def makeRange(lst):
        pos = e + 1                     # ready to check for next range
    return ret

-sources = "chvalid.def"                 # input filename
-
 # minTableSize gives the minimum number of ranges which must be present
 # before a 256-byte lookup table is produced.  If there are less than this
 # number, a macro with inline comparisons is generated
@ -54,9 +51,9 @@ Functs = {}
 state = 0

 try:
-    defines = open("chvalid.def", "r")
+    defines = open("codegen/chvalid.def", "r")
 except:
-    print("Missing chvalid.def, aborting ...")
+    print("Missing codegen/chvalid.def, aborting ...")
    sys.exit(1)

 #
@ -202,19 +199,19 @@ except:
    print("Failed to open chvalid.c")
    sys.exit(1)

-date = time.asctime(time.localtime(time.time()))
+fkeys = sorted(Functs.keys())

 header.write(
-"""/*
- * Summary: Unicode character range checking
- * Description: this module exports interfaces for the character
+"""/**
+ * @file
+ *
+ * @brief Unicode character range checking
+ *
+ * this module exports interfaces for the character
 *               range validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
- *
- * Generation date: %s
- * Sources: %s
 */

 #ifndef __XML_CHVALID_H__
@ -227,6 +224,8 @@ header.write(
 extern "C" {
 #endif

+/** @cond ignore */
+
 /*
 * Define our typedefs and structures
 *
@ -254,13 +253,27 @@ struct _xmlChRangeGroup {
    const xmlChLRange\t*longRange;
 };

+""");
+
+for f in fkeys:
+    if len(Functs[f][1]) > 0:
+        header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)
+    if max(Functs[f][0]) > 0:   # only check if at least one entry
+        rangeTable = makeRange(Functs[f][0])
+        numRanges = len(rangeTable)
+        if numRanges >= minTableSize:   # table is worthwhile
+            header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
+
+header.write("""
 /**
 * Range checking routine
 */
 XMLPUBFUN int
 \t\txmlCharInRange(unsigned int val, const xmlChRangeGroup *group);

-""" % (date, sources));
+/** @endcond */
+""");
+
 output.write(
 """/*
 * chvalid.c:\tthis module implements the character range
@ -268,9 +281,6 @@ output.write(
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
- *
- * Generation date: %s
- * Sources: %s
 */

 #define IN_LIBXML
@ -287,7 +297,7 @@ output.write(
 * allowed.
 *
 */
-""" % (date, sources));
+""");

 #
 # Now output the generated data.
@ -298,8 +308,6 @@ output.write(
 # compares, otherwise we output a 256-byte table and a macro to use it.
 #

-fkeys = sorted(Functs.keys())
-
 for f in fkeys:

 # First we convert the specified single-byte values into a group of ranges.
@ -310,15 +318,13 @@ for f in fkeys:
        rangeTable = makeRange(Functs[f][0])
        numRanges = len(rangeTable)
        if numRanges >= minTableSize:   # table is worthwhile
-            header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
            header.write("""
 /**
- * %s_ch:
- * @c: char to validate
- *
 * Automatically generated by genChRanges.py
+ *
+ * @param c  char to validate
 */
-""" % f)
+""")
            header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))

            # write the constant data to the code file
@ -343,12 +349,11 @@ for f in fkeys:

            header.write("""
 /**
- * %s_ch:
- * @c: char to validate
- *
 * Automatically generated by genChRanges.py
+ *
+ * @param c  char to validate
 */
-""" % f)
+""")
            # okay, I'm tired of the messy lineup - let's automate it!
            pline = "#define %s_ch(c)" % f
            # 'ntab' is number of tabs needed to position to col. 33 from name end
@ -378,12 +383,11 @@ for f in fkeys:

    header.write("""
 /**
- * %sQ:
- * @c: char to validate
- *
 * Automatically generated by genChRanges.py
+ *
+ * @param c  char to validate
 */
-""" % f)
+""")
    pline = "#define %sQ(c)" % f
    ntab = 4 - (len(pline)) // 8
    if ntab < 0:
@ -403,7 +407,7 @@ for f in fkeys:
        header.write(" 0)\n\n")
    else:
        if numRanges >= minTableSize:
-            header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n"  % f)
+            header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n"  % f)
        else:           # if < minTableSize, generate inline code
            firstFlag = 1
            for rg in Functs[f][1]:
@ -417,14 +421,10 @@ for f in fkeys:
                else:                           # value range
                    pline += "((0x%x <= (c)) &&" % rg[0]
                    pline += " ((c) <= 0x%x))" % rg[1]
-            pline += "))\n\n"
+            pline += "))\n"
            header.write(pline)


-    if len(Functs[f][1]) > 0:
-        header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)
-
-
 #
 # Next we do the unicode ranges
 #
@ -477,14 +477,12 @@ for f in fkeys:
 output.write(
 """
 /**
- * xmlCharInRange:
- * @val: character to be validated
- * @rptr: pointer to range to be used to validate
- *
 * Does a binary search of the range table to determine if char
 * is valid
 *
- * Returns: true if character valid, false otherwise
+ * @param val  character to be validated
+ * @param rptr  pointer to range to be used to validate
+ * @returns true if character valid, false otherwise
 */
 int
 xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
@ -542,18 +540,16 @@ xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
 for f in fkeys:
    output.write("""
 /**
- * %s:
- * @ch:  character to validate
- *
 * This function is DEPRECATED.
-""" % f);
+""");
    if max(Functs[f][0]) > 0:
-        output.write(" * Use %s_ch or %sQ instead" % (f, f))
+        output.write(" * Use %s_ch() or %sQ() instead" % (f, f))
    else:
-        output.write(" * Use %sQ instead" % f)
+        output.write(" * Use %sQ() instead" % f)
    output.write("""
 *
- * Returns true if argument valid, false otherwise
+ * @param ch  character to validate
+ * @returns true if argument valid, false otherwise
 */
 """)
    output.write("int\n%s(unsigned int ch) {\n    return(%sQ(ch));\n}\n\n" % (f,f))
--- a/codegen/genCharset.py
+++ b/codegen/genCharset.py
@ -73,7 +73,7 @@ def genTranscodeTable(out, name, chars):
    printHexTable(out, 2, data)
    out.write('};\n\n')

-out = open(f'iso8859x.inc', 'w')
+out = open(f'codegen/charset.inc', 'w')

 out.write('''/*
 * Lookup tables for transcoding of 8-bit character sets.
--- a/codegen/genEscape.py
+++ b/codegen/genEscape.py
@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+entities = [
+    [ '',   '&#xFFFD;' ],
+    [ '\t', '&#9;' ],
+    [ '\n', '&#10;' ],
+    [ '\r', '&#13;' ],
+    [ '"',  '&quot;' ],
+    [ '&',  '&amp;' ],
+    [ '<',  '&lt;' ],
+    [ '>',  '&gt;' ],
+]
+
+offset = [ None ] * 128
+
+def gen_content(out):
+    pos = 0
+    r = ''
+
+    for rec in entities:
+        char, repl = rec
+
+        if char:
+            offset[ord(char)] = pos
+
+        if pos % 12 == 0: r += '\n    '
+        else: r += ' '
+        r += '%3d,' % len(repl)
+        pos += 1
+
+        for c in repl:
+            if pos % 12 == 0: r += '\n    '
+            else: r += ' '
+            r += "'%s'," % c
+            pos += 1
+
+    out.write('static const char xmlEscapeContent[] = {%s\n};\n\n' % r)
+
+def gen_tab(out, name, escape, is_xml):
+    r = ''
+
+    for i in range(0x80):
+
+        if chr(i) in escape:
+            v = offset[i]
+        elif i == 0:
+            v = 0
+        elif is_xml and i < 32 and i != 9 and i != 10:
+            v = 0
+        else:
+            v = -1
+
+        if i % 16 == 0: r += '\n    '
+        else: r += ' '
+        r += '%2d,' % v
+
+    out.write('static const signed char %s[128] = {%s\n};\n\n' % (name, r))
+
+with open('codegen/escape.inc', 'w') as out:
+    gen_content(out)
+
+    gen_tab(out, 'xmlEscapeTab', '\r&<>', True)
+    gen_tab(out, 'xmlEscapeTabQuot', '\r"&<>', True)
+    gen_tab(out, 'xmlEscapeTabAttr', '\t\n\r"&<>', True)
+
+    out.write('#ifdef LIBXML_HTML_ENABLED\n\n')
+    gen_tab(out, 'htmlEscapeTab', '&<>', False)
+    gen_tab(out, 'htmlEscapeTabAttr', '"&', False)
+    out.write('#endif /* LIBXML_HTML_ENABLED */\n')
--- a/codegen/genHtml5Ent.py
+++ b/codegen/genHtml5Ent.py
@ -162,8 +162,9 @@ def gen_table(ctype, cname, values, fmt, elems_per_line):
        else: r += ' '
        r += fmt % values[i]

-    return f'static const {ctype} {cname}[{count}] = {{{r}\n}};\n'
+    return f'static const {ctype} {cname}[{count}] = {{{r}\n}};\n\n'

-print(gen_table('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15))
-print(gen_table('unsigned short', 'htmlEntValues', values, '%5d', 10))
-print(gen_table('unsigned char', 'htmlEntStrings', strings, '%3s', 15))
+with open('codegen/html5ent.inc', 'w') as out:
+    out.write(gen_table('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15))
+    out.write(gen_table('unsigned short', 'htmlEntValues', values, '%5d', 10))
+    out.write(gen_table('unsigned char', 'htmlEntStrings', strings, '%3s', 15))
--- a/codegen/genHtml5LibTests.py
+++ b/codegen/genHtml5LibTests.py
--- a/codegen/genTestApi.py
+++ b/codegen/genTestApi.py
@ -13,40 +13,40 @@ import xmlmod
 # Globals

 dtors = {
-    'htmlDocPtr': 'xmlFreeDoc',
-    'htmlParserCtxtPtr': 'htmlFreeParserCtxt',
-    'xmlAutomataPtr': 'xmlFreeAutomata',
-    'xmlBufferPtr': 'xmlBufferFree',
-    'xmlCatalogPtr': 'xmlFreeCatalog',
+    'htmlDoc *': 'xmlFreeDoc',
+    'htmlParserCtxt *': 'htmlFreeParserCtxt',
+    'xmlAutomata *': 'xmlFreeAutomata',
+    'xmlBuffer *': 'xmlBufferFree',
+    'xmlCatalog *': 'xmlFreeCatalog',
    'xmlChar *': 'xmlFree',
-    'xmlDOMWrapCtxtPtr': 'xmlDOMWrapFreeCtxt',
-    'xmlDictPtr': 'xmlDictFree',
-    'xmlDocPtr': 'xmlFreeDoc',
-    'xmlDtdPtr': 'xmlFreeDtd',
-    'xmlEntitiesTablePtr': 'xmlFreeEntitiesTable',
-    'xmlEnumerationPtr': 'xmlFreeEnumeration',
-    'xmlListPtr': 'xmlListDelete',
-    'xmlModulePtr': 'xmlModuleFree',
-    'xmlMutexPtr': 'xmlFreeMutex',
-    'xmlNodePtr': 'xmlFreeNode',
-    'xmlNodeSetPtr': 'xmlXPathFreeNodeSet',
-    'xmlNsPtr': 'xmlFreeNs',
-    'xmlOutputBufferPtr': 'xmlOutputBufferClose',
-    'xmlParserCtxtPtr': 'xmlFreeParserCtxt',
-    'xmlParserInputBufferPtr': 'xmlFreeParserInputBuffer',
-    'xmlParserInputPtr': 'xmlFreeInputStream',
-    'xmlRMutexPtr': 'xmlFreeRMutex',
-    'xmlRelaxNGValidCtxtPtr': 'xmlRelaxNGFreeValidCtxt',
-    'xmlSaveCtxtPtr': 'xmlSaveClose',
-    'xmlSchemaFacetPtr': 'xmlSchemaFreeFacet',
-    'xmlSchemaValPtr': 'xmlSchemaFreeValue',
-    'xmlSchemaValidCtxtPtr': 'xmlSchemaFreeValidCtxt',
-    'xmlTextWriterPtr': 'xmlFreeTextWriter',
-    'xmlURIPtr': 'xmlFreeURI',
-    'xmlValidCtxtPtr': 'xmlFreeValidCtxt',
-    'xmlXPathContextPtr': 'xmlXPathFreeContext',
-    'xmlXPathParserContextPtr': 'xmlXPathFreeParserContext',
-    'xmlXPathObjectPtr': 'xmlXPathFreeObject',
+    'xmlDOMWrapCtxt *': 'xmlDOMWrapFreeCtxt',
+    'xmlDict *': 'xmlDictFree',
+    'xmlDoc *': 'xmlFreeDoc',
+    'xmlDtd *': 'xmlFreeDtd',
+    'xmlEntitiesTable *': 'xmlFreeEntitiesTable',
+    'xmlEnumeration *': 'xmlFreeEnumeration',
+    'xmlList *': 'xmlListDelete',
+    'xmlModule *': 'xmlModuleFree',
+    'xmlMutex *': 'xmlFreeMutex',
+    'xmlNode *': 'xmlFreeNode',
+    'xmlNodeSet *': 'xmlXPathFreeNodeSet',
+    'xmlNs *': 'xmlFreeNs',
+    'xmlOutputBuffer *': 'xmlOutputBufferClose',
+    'xmlParserCtxt *': 'xmlFreeParserCtxt',
+    'xmlParserInputBuffer *': 'xmlFreeParserInputBuffer',
+    'xmlParserInput *': 'xmlFreeInputStream',
+    'xmlRMutex *': 'xmlFreeRMutex',
+    'xmlRelaxNGValidCtxt *': 'xmlRelaxNGFreeValidCtxt',
+    'xmlSaveCtxt *': 'xmlSaveClose',
+    'xmlSchemaFacet *': 'xmlSchemaFreeFacet',
+    'xmlSchemaVal *': 'xmlSchemaFreeValue',
+    'xmlSchemaValidCtxt *': 'xmlSchemaFreeValidCtxt',
+    'xmlTextWriter *': 'xmlFreeTextWriter',
+    'xmlURI *': 'xmlFreeURI',
+    'xmlValidCtxt *': 'xmlFreeValidCtxt',
+    'xmlXPathContext *': 'xmlXPathFreeContext',
+    'xmlXPathParserContext *': 'xmlXPathFreeParserContext',
+    'xmlXPathObject *': 'xmlXPathFreeObject',
 }

 blockList = {
@ -194,7 +194,7 @@ for file in os.listdir(xmlDocDir):
            dtor = dtors.get(rtype)
            if dtor is not None:
                code = f'{dtor}({code})'
-            elif rtype == 'xmlHashTablePtr':
+            elif rtype == 'xmlHashTable *':
                code = f'xmlHashFree({code}, NULL)'

            mmfunc[name] = f'    {code};\n'
--- a/codegen/genUnicode.py
+++ b/codegen/genUnicode.py
@ -11,10 +11,8 @@
 #
 import sys
 import string
-import time

 webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
-sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

 #
 # blockAliases is a small hack - it is used for mapping block names which
@ -31,7 +29,8 @@ blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
 # number, inline comparisons are generated
 minTableSize = 8

-(blockfile, catfile) = sources.split()
+blockfile = "Blocks-4.0.1.txt"
+catfile = "UnicodeData-4.0.1.txt"


 #
@ -197,8 +196,6 @@ except:
    print("Failed to open xmlunicode.c")
    sys.exit(1)

-date = time.asctime(time.localtime(time.time()))
-
 output.write(
 """/*
 * xmlunicode.c: this module implements the Unicode character APIs
@ -207,9 +204,6 @@ output.write(
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
- *
- * Generation date: %s
- * Sources: %s
 */

 #define IN_LIBXML
@ -238,7 +232,7 @@ typedef struct {

 static xmlIntFunc *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname);

-""" % (webpage, date, sources));
+""" % webpage);

 #
 # For any categories with more than minTableSize ranges we generate
@ -281,13 +275,11 @@ for name in ckeys:

 output.write(
 """/**
- * xmlUnicodeLookup:
- * @tptr: pointer to the name table
- * @tname: name to be found
- *
 * binary table lookup for user-supplied name
 *
- * Returns pointer to range function if found, otherwise NULL
+ * @param tptr  pointer to the name table
+ * @param tname  name to be found
+ * @returns pointer to range function if found, otherwise NULL
 */
 static xmlIntFunc
 *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname) {
@ -316,10 +308,10 @@ static xmlIntFunc

 for block in bkeys:
    name = block.replace('-', '')
-    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
-    output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
+    output.write("/**\n * Check whether the character is part of %s UCS Block\n"%
                 (block))
-    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
+    output.write(" *\n * @param code  UCS code point\n")
+    output.write(" * @returns 1 if true 0 otherwise\n */\n");
    output.write("static int\nxmlUCSIs%s(int code) {\n    return(" % name)
    flag = 0
    for (start, end) in BlockNames[block]:
@ -332,10 +324,10 @@ for block in bkeys:

 for name in ckeys:
    ranges = Categories[name]
-    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
-    output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
+    output.write("/**\n * Check whether the character is part of %s UCS Category\n"%
                 (name))
-    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
+    output.write(" *\n * @param code  UCS code point\n")
+    output.write(" * @returns 1 if true 0 otherwise\n */\n");
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(Categories[name]) > minTableSize:
        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
@ -385,13 +377,11 @@ static const xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
 static const xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

 /**
- * xmlUCSIsBlock:
- * @code: UCS code point
- * @block: UCS block name
- *
 * Check whether the character is part of the UCS Block
 *
- * Returns 1 if true, 0 if false and -1 on unknown block
+ * @param code  UCS code point
+ * @param block  UCS block name
+ * @returns 1 if true, 0 if false and -1 on unknown block
 */
 int
 xmlUCSIsBlock(int code, const char *block) {
@ -404,13 +394,11 @@ xmlUCSIsBlock(int code, const char *block) {
 }

 /**
- * xmlUCSIsCat:
- * @code: UCS code point
- * @cat: UCS Category name
- *
 * Check whether the character is part of the UCS Category
 *
- * Returns 1 if true, 0 if false and -1 on unknown category
+ * @param code  UCS code point
+ * @param cat  UCS Category name
+ * @returns 1 if true, 0 if false and -1 on unknown category
 */
 int
 xmlUCSIsCat(int code, const char *cat) {
--- a/codegen/html5ent.inc
+++ b/codegen/html5ent.inc
--- a/codegen/xmlmod.py
+++ b/codegen/xmlmod.py
--- a/encoding.c
+++ b/encoding.c
@ -279,7 +279,7 @@ UTF8ToHtmlWrapper(void *vctxt, unsigned char *out, int *outlen,
 #define UTF8ToHtmlWrapper NULL
 #endif

-#include "iso8859x.inc"
+#include "codegen/charset.inc"

 static xmlCharEncError
 EightBitToUtf8(void *vctxt, unsigned char* out, int *outlen,
--- a/python/generator.py
+++ b/python/generator.py
@ -350,7 +350,7 @@ skipped_types = {
 import os
 import xml.etree.ElementTree as etree

-sys.path.append(srcPref + '/../tools')
+sys.path.append(srcPref + '/../codegen')
 import xmlmod

 xmlDocDir = dstPref + '/../doc/xml'
--- a/tools/genEscape.py
+++ b/tools/genEscape.py
@ -1,66 +0,0 @@
-#!/usr/bin/env python3
-
-entities = [
-    [ '',   '&#xFFFD;' ],
-    [ '\t', '&#9;' ],
-    [ '\n', '&#10;' ],
-    [ '\r', '&#13;' ],
-    [ '"',  '&quot;' ],
-    [ '&',  '&amp;' ],
-    [ '<',  '&lt;' ],
-    [ '>',  '&gt;' ],
-]
-
-### xmlEscapeContent
-
-offset = [ None ] * 128
-pos = 0
-r = ''
-
-for rec in entities:
-    char, repl = rec
-
-    if char:
-        offset[ord(char)] = pos
-
-    if pos % 12 == 0: r += '\n    '
-    else: r += ' '
-    r += '%3d,' % len(repl)
-    pos += 1
-
-    for c in repl:
-        if pos % 12 == 0: r += '\n    '
-        else: r += ' '
-        r += "'%s'," % c
-        pos += 1
-
-print('static const char xmlEscapeContent[] = {%s\n};\n' % r)
-
-def gen_tab(name, escape, is_xml):
-    r = ''
-
-    for i in range(0x80):
-
-        if chr(i) in escape:
-            v = offset[i]
-        elif i == 0:
-            v = 0
-        elif is_xml and i < 32 and i != 9 and i != 10:
-            v = 0
-        else:
-            v = -1
-
-        if i % 16 == 0: r += '\n    '
-        else: r += ' '
-        r += '%2d,' % v
-
-    print('static const signed char %s[128] = {%s\n};\n' % (name, r))
-
-gen_tab('xmlEscapeTab', '\r&<>', True)
-gen_tab('xmlEscapeTabQuot', '\r"&<>', True)
-gen_tab('xmlEscapeTabAttr', '\t\n\r"&<>', True)
-
-print('#ifdef LIBXML_HTML_ENABLED\n')
-gen_tab('htmlEscapeTab', '&<>', False)
-gen_tab('htmlEscapeTabAttr', '"&', False)
-print('#endif /* LIBXML_HTML_ENABLED */')
--- a/xmlIO.c
+++ b/xmlIO.c
@ -159,76 +159,7 @@ xmlSerializeHexCharRef(char *buf, int val) {
    return(out - buf);
 }

-/*
- * Tables generated with tools/genEscape.py
- */
-
-static const char xmlEscapeContent[] = {
-      8, '&', '#', 'x', 'F', 'F', 'F', 'D', ';',   4, '&', '#',
-    '9', ';',   5, '&', '#', '1', '0', ';',   5, '&', '#', '1',
-    '3', ';',   6, '&', 'q', 'u', 'o', 't', ';',   5, '&', 'a',
-    'm', 'p', ';',   4, '&', 'l', 't', ';',   4, '&', 'g', 't',
-    ';',
-};
-
-static const signed char xmlEscapeTab[128] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0, 20,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-    -1, -1, -1, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-static const signed char xmlEscapeTabQuot[128] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0, 20,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-static const signed char xmlEscapeTabAttr[128] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  9, 14,  0,  0, 20,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-#ifdef LIBXML_HTML_ENABLED
-
-static const signed char htmlEscapeTab[128] = {
-     0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, 44, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-static const signed char htmlEscapeTabAttr[128] = {
-     0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, 26, -1, -1, -1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-#endif /* LIBXML_HTML_ENABLED */
+#include "codegen/escape.inc"

 /*
 * @param text  input text