1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-26 00:37:43 +03:00

codegen: Merge xmlunicode.c into xmlregexp.c

Include generated parts.

Generate xmlChRangeGroups instead of functions for Unicode blocks.
This commit is contained in:
Nick Wellnhofer
2025-05-16 02:12:23 +02:00
parent 4cb767e96e
commit c4926b19d3
8 changed files with 723 additions and 2182 deletions

View File

@@ -12,8 +12,6 @@
import sys
import string
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
@@ -60,8 +58,8 @@ for line in blocks.readlines():
except:
print("Failed to process line: %s" % (line))
continue
start = "0x" + start
end = "0x" + end
start = int(start, 16)
end = int(end, 16)
try:
BlockNames[name].append((start, end))
except:
@@ -142,7 +140,7 @@ for line in data.readlines():
except:
print("Failed to process line: %s" % (line))
blocks.close()
data.close()
print("Parsed %d char generating %d categories" % (nbchar, len(Categories.keys())))
#
@@ -191,55 +189,17 @@ ckeys = sorted(Categories.keys())
# Generate the resulting files
#
try:
output = open("xmlunicode.c", "w")
output = open("codegen/unicode.inc", "w")
except:
print("Failed to open xmlunicode.c")
print("Failed to open codegen/unicode.inc")
sys.exit(1)
output.write(
"""/*
* xmlunicode.c: this module implements the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* %s
* using the genUnicode.py Python script.
*/
#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_REGEXP_ENABLED
#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/chvalid.h>
#include "private/unicode.h"
typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */
typedef struct {
const char *rangename;
xmlIntFunc *func;
} xmlUnicodeRange;
typedef struct {
const xmlUnicodeRange *table;
int numentries;
} xmlUnicodeNameTable;
static xmlIntFunc *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname);
""" % webpage);
#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
for name in ckeys:
if len(Categories[name]) > minTableSize:
if len(Categories[name]) > minTableSize and name != 'Cs':
numshort = 0
numlong = 0
ranges = Categories[name]
@@ -257,7 +217,7 @@ for name in ckeys:
else:
if numlong == 0:
if numshort > 0:
output.write(pline + " };\n")
output.write(pline + "};\n")
pline = "static const xmlChLRange xml%sL[] = {" % name
lptr = "xml%sL" % name
else:
@@ -269,66 +229,14 @@ for name in ckeys:
elif pline[-1:] == ",":
pline += " "
pline += "{%s, %s}" % (hex(low), hex(high))
output.write(pline + " };\nstatic const xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
output.write(pline + "};\nstatic const xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
% (name, numshort, numlong, sptr, lptr))
output.write(
"""/**
* binary table lookup for user-supplied name
*
* @param tptr pointer to the name table
* @param tname name to be found
* @returns pointer to range function if found, otherwise NULL
*/
static xmlIntFunc
*xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname) {
int low, high, mid, cmp;
const xmlUnicodeRange *sptr;
if ((tptr == NULL) || (tname == NULL)) return(NULL);
low = 0;
high = tptr->numentries - 1;
sptr = tptr->table;
while (low <= high) {
mid = (low + high) / 2;
cmp = strcmp(tname, sptr[mid].rangename);
if (cmp == 0)
return (sptr[mid].func);
if (cmp < 0)
high = mid - 1;
else
low = mid + 1;
}
return (NULL);
}
""")
for block in bkeys:
name = block.replace('-', '')
output.write("/**\n * Check whether the character is part of %s UCS Block\n"%
(block))
output.write(" *\n * @param code UCS code point\n")
output.write(" * @returns 1 if true 0 otherwise\n */\n");
output.write("static int\nxmlUCSIs%s(int code) {\n return(" % name)
flag = 0
for (start, end) in BlockNames[block]:
if flag:
output.write(" ||\n ")
else:
flag = 1
output.write("((code >= %s) && (code <= %s))" % (start, end))
output.write(");\n}\n\n")
for name in ckeys:
if name == 'Cs':
continue
ranges = Categories[name]
output.write("/**\n * Check whether the character is part of %s UCS Category\n"%
(name))
output.write(" *\n * @param code UCS code point\n")
output.write(" * @returns 1 if true 0 otherwise\n */\n");
output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
output.write("static int\nxmlUCSIsCat%s(int code) {\n" % name)
if len(Categories[name]) > minTableSize:
output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
% name)
@@ -348,69 +256,53 @@ for name in ckeys:
hex(begin), hex(end)))
output.write(");\n}\n\n")
output.write(
"""static const xmlUnicodeRange xmlUnicodeBlocks[] = {""")
#
# Range tables for blocks
#
blockGroups = ''
flag = 0
for block in bkeys:
name = block.replace('-', '')
if flag:
output.write(',\n')
else:
numshort = 0
numlong = 0
ranges = BlockNames[block]
sptr = "NULL"
lptr = "NULL"
for range in ranges:
(low, high) = range
if high < 0x10000:
if numshort == 0:
pline = "static const xmlChSRange xml%sS[] = {" % name
sptr = "xml%sS" % name
else:
pline += ","
numshort += 1
else:
if numlong == 0:
if numshort > 0:
output.write(pline + "};\n")
pline = "static const xmlChLRange xml%sL[] = {" % name
lptr = "xml%sL" % name
else:
pline += ","
numlong += 1
if len(pline) > 60:
output.write(pline + "\n")
pline = " "
elif pline[-1:] == ",":
pline += " "
pline += "{%s, %s}" % (hex(low), hex(high))
output.write(pline + "};\n\n")
if flag == 0:
flag = 1
output.write(' {"%s", xmlUCSIs%s}' % (block, name))
output.write('};\n\n')
output.write('static const xmlUnicodeRange xmlUnicodeCats[] = {\n')
flag = 0;
for name in ckeys:
if flag:
output.write(',\n')
else:
flag = 1
output.write(' {"%s", xmlUCSIsCat%s}' % (name, name))
blockGroups += ",\n"
blockGroups += ' {"%s",\n {%s,%s,%s,%s}}' % (block, numshort, numlong,
sptr, lptr)
output.write(
"""};
static const xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static const xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
/**
* Check whether the character is part of the UCS Block
*
* @param code UCS code point
* @param block UCS block name
* @returns 1 if true, 0 if false and -1 on unknown block
*/
int
xmlUCSIsBlock(int code, const char *block) {
xmlIntFunc *func;
func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
if (func == NULL)
return (-1);
return (func(code));
}
/**
* Check whether the character is part of the UCS Category
*
* @param code UCS code point
* @param cat UCS Category name
* @returns 1 if true, 0 if false and -1 on unknown category
*/
int
xmlUCSIsCat(int code, const char *cat) {
xmlIntFunc *func;
func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
if (func == NULL)
return (-1);
return (func(code));
}
#endif /* LIBXML_REGEXP_ENABLED */
""" % (len(BlockNames), len(Categories)))
output.write("static const xmlUnicodeRange xmlUnicodeBlocks[] = {\n")
output.write(blockGroups)
output.write("\n};\n\n")
output.close()