codegen: Merge xmlunicode.c into xmlregexp.c

Include the generated parts (now written to codegen/unicode.inc) instead of building a standalone xmlunicode.c. For Unicode blocks, generate xmlChRangeGroups instead of per-block functions.
2025-10-26 00:37:43 +03:00 · 2025-05-16 02:12:23 +02:00
parent 4cb767e96e
commit c4926b19d3
8 changed files with 723 additions and 2182 deletions
--- a/codegen/genUnicode.py
+++ b/codegen/genUnicode.py
@@ -12,8 +12,6 @@
 import sys
 import string

-webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
-
 #
 # blockAliases is a small hack - it is used for mapping block names which
 # were were used in the 3.1 release, but are missing or changed in the current
@@ -60,8 +58,8 @@ for line in blocks.readlines():
    except:
        print("Failed to process line: %s" % (line))
        continue
-    start = "0x" + start
-    end = "0x" + end
+    start = int(start, 16)
+    end = int(end, 16)
    try:
        BlockNames[name].append((start, end))
    except:
@@ -142,7 +140,7 @@ for line in data.readlines():
        except:
            print("Failed to process line: %s" % (line))

-blocks.close()
+data.close()
 print("Parsed %d char generating %d categories" % (nbchar, len(Categories.keys())))

 #
@@ -191,55 +189,17 @@ ckeys = sorted(Categories.keys())
 # Generate the resulting files
 #
 try:
-    output = open("xmlunicode.c", "w")
+    output = open("codegen/unicode.inc", "w")
 except:
-    print("Failed to open xmlunicode.c")
+    print("Failed to open codegen/unicode.inc")
    sys.exit(1)

-output.write(
-"""/*
- * xmlunicode.c: this module implements the Unicode character APIs
- *
- * This file is automatically generated from the
- * UCS description files of the Unicode Character Database
- * %s
- * using the genUnicode.py Python script.
- */
-
-#define IN_LIBXML
-#include "libxml.h"
-
-#ifdef LIBXML_REGEXP_ENABLED
-
-#include <string.h>
-#include <libxml/xmlversion.h>
-#include <libxml/chvalid.h>
-
-#include "private/unicode.h"
-
-typedef int (xmlIntFunc)(int);	/* just to keep one's mind untwisted */
-
-typedef struct {
-    const char *rangename;
-    xmlIntFunc *func;
-} xmlUnicodeRange;
-
-typedef struct {
-    const xmlUnicodeRange *table;
-    int		    numentries;
-} xmlUnicodeNameTable;
-
-
-static xmlIntFunc *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname);
-
-""" % webpage);
-
 #
 # For any categories with more than minTableSize ranges we generate
 # a range table suitable for xmlCharInRange
 #
 for name in ckeys:
-  if len(Categories[name]) > minTableSize:
+  if len(Categories[name]) > minTableSize and name != 'Cs':
    numshort = 0
    numlong = 0
    ranges = Categories[name]
@@ -257,7 +217,7 @@ for name in ckeys:
      else:
        if numlong == 0:
          if numshort > 0:
-            output.write(pline + " };\n")
+            output.write(pline + "};\n")
          pline = "static const xmlChLRange xml%sL[] = {" % name
          lptr = "xml%sL" % name
        else:
@@ -269,66 +229,14 @@ for name in ckeys:
      elif pline[-1:] == ",":
        pline += " "
      pline += "{%s, %s}" % (hex(low), hex(high))
-    output.write(pline + " };\nstatic const xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
+    output.write(pline + "};\nstatic const xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
         % (name, numshort, numlong, sptr, lptr))

-
-output.write(
-"""/**
- * binary table lookup for user-supplied name
- *
- * @param tptr  pointer to the name table
- * @param tname  name to be found
- * @returns pointer to range function if found, otherwise NULL
- */
-static xmlIntFunc
-*xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname) {
-    int low, high, mid, cmp;
-    const xmlUnicodeRange *sptr;
-
-    if ((tptr == NULL) || (tname == NULL)) return(NULL);
-
-    low = 0;
-    high = tptr->numentries - 1;
-    sptr = tptr->table;
-    while (low <= high) {
-	mid = (low + high) / 2;
-	cmp = strcmp(tname, sptr[mid].rangename);
-	if (cmp == 0)
-	    return (sptr[mid].func);
-	if (cmp < 0)
-	    high = mid - 1;
-	else
-	    low = mid + 1;
-    }
-    return (NULL);
-}
-
-""")
-
-for block in bkeys:
-    name = block.replace('-', '')
-    output.write("/**\n * Check whether the character is part of %s UCS Block\n"%
-                 (block))
-    output.write(" *\n * @param code  UCS code point\n")
-    output.write(" * @returns 1 if true 0 otherwise\n */\n");
-    output.write("static int\nxmlUCSIs%s(int code) {\n    return(" % name)
-    flag = 0
-    for (start, end) in BlockNames[block]:
-        if flag:
-            output.write(" ||\n           ")
-        else:
-            flag = 1
-        output.write("((code >= %s) && (code <= %s))" % (start, end))
-    output.write(");\n}\n\n")
-
 for name in ckeys:
+    if name == 'Cs':
+        continue
    ranges = Categories[name]
-    output.write("/**\n * Check whether the character is part of %s UCS Category\n"%
-                 (name))
-    output.write(" *\n * @param code  UCS code point\n")
-    output.write(" * @returns 1 if true 0 otherwise\n */\n");
-    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
+    output.write("static int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(Categories[name]) > minTableSize:
        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
            % name)
@@ -348,69 +256,53 @@ for name in ckeys:
                         hex(begin), hex(end)))
    output.write(");\n}\n\n")

-output.write(
-"""static const xmlUnicodeRange xmlUnicodeBlocks[] = {""")
+#
+# Range tables for blocks
+#

+blockGroups = ''
 flag = 0
 for block in bkeys:
    name = block.replace('-', '')
-    if flag:
-        output.write(',\n')
-    else:
+    numshort = 0
+    numlong = 0
+    ranges = BlockNames[block]
+    sptr = "NULL"
+    lptr = "NULL"
+    for range in ranges:
+        (low, high) = range
+        if high < 0x10000:
+            if numshort == 0:
+                pline = "static const xmlChSRange xml%sS[] = {" % name
+                sptr = "xml%sS" % name
+            else:
+                pline += ","
+            numshort += 1
+        else:
+            if numlong == 0:
+                if numshort > 0:
+                    output.write(pline + "};\n")
+                pline = "static const xmlChLRange xml%sL[] = {" % name
+                lptr = "xml%sL" % name
+            else:
+                pline += ","
+            numlong += 1
+        if len(pline) > 60:
+            output.write(pline + "\n")
+            pline = "    "
+        elif pline[-1:] == ",":
+            pline += " "
+        pline += "{%s, %s}" % (hex(low), hex(high))
+    output.write(pline + "};\n\n")
+    if flag == 0:
        flag = 1
-    output.write('  {"%s", xmlUCSIs%s}' % (block, name))
-output.write('};\n\n')
-
-output.write('static const xmlUnicodeRange xmlUnicodeCats[] = {\n')
-flag = 0;
-for name in ckeys:
-    if flag:
-        output.write(',\n')
    else:
-        flag = 1
-    output.write('  {"%s", xmlUCSIsCat%s}' % (name, name))
+        blockGroups += ",\n"
+    blockGroups += '  {"%s",\n   {%s,%s,%s,%s}}' % (block, numshort, numlong,
+                                                    sptr, lptr)

-output.write(
-"""};
-
-static const xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
-static const xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
-
-/**
- * Check whether the character is part of the UCS Block
- *
- * @param code  UCS code point
- * @param block  UCS block name
- * @returns 1 if true, 0 if false and -1 on unknown block
- */
-int
-xmlUCSIsBlock(int code, const char *block) {
-    xmlIntFunc *func;
-
-    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
-    if (func == NULL)
-	return (-1);
-    return (func(code));
-}
-
-/**
- * Check whether the character is part of the UCS Category
- *
- * @param code  UCS code point
- * @param cat  UCS Category name
- * @returns 1 if true, 0 if false and -1 on unknown category
- */
-int
-xmlUCSIsCat(int code, const char *cat) {
-    xmlIntFunc *func;
-
-    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
-    if (func == NULL)
-	return (-1);
-    return (func(code));
-}
-
-#endif /* LIBXML_REGEXP_ENABLED */
-""" % (len(BlockNames), len(Categories)))
+output.write("static const xmlUnicodeRange xmlUnicodeBlocks[] = {\n")
+output.write(blockGroups)
+output.write("\n};\n\n")

 output.close()