libxml2/codegen/genUnicode.py

#!/usr/bin/env python3
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#	the current release, but are needed for ABI compatibility.  This
#	must be accomplished MANUALLY!  Please see the comments below under
#     'blockAliases'
#
import sys
import string
import rangetab

#
# blockAliases is a small hack: it maps block names that were used in the
# 3.1 release but are missing or renamed in the current release onto the
# block(s) that replace them.  Each entry has the format
# "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# minTableSize is the threshold used when emitting the per-category code:
# a category gets a range table only when it has MORE than this many
# ranges; with this many or fewer, inline comparisons are generated
# instead.  (Block range tables are always generated, whatever their size.)
minTableSize = 8

# Input data files.  These are plain relative paths, so they are resolved
# against the current working directory when the script runs.
blockfile = "Blocks-4.0.1.txt"
catfile = "UnicodeData-4.0.1.txt"


#
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname.  Each entry is a list of (start, end) tuples
# (both integers, parsed from hexadecimal) giving the block's range(s).
#
# Each data line is expected to look like "START..END; Block Name";
# lines starting with '#' and blank lines are ignored, and spaces are
# removed from the block name.
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except OSError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

# The context manager guarantees the file is closed even if parsing fails.
with blocks:
    for line in blocks:
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
            # Converted inside the guarded region so that a malformed hex
            # value is reported like any other bad line instead of
            # aborting the whole run with a traceback.
            start = int(start, 16)
            end = int(end, 16)
        except (ValueError, IndexError):
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append((start, end))
print("Parsed %d blocks descriptions" % (len(BlockNames.keys())))

# Expand the aliases: every old block name receives the concatenated
# ranges of all the current blocks it maps to.  A replacement block that
# is not present in BlockNames is reported and skipped; the alias entry
# is only created once at least one replacement has been found.
for alias_spec in blockAliases:
    spec_parts = alias_spec.split(':')
    old_name = spec_parts[0]
    for new_name in spec_parts[1].split(','):
        if new_name not in BlockNames:
            print("Alias %s: %s not in Blocks" % (old_name, new_name))
            continue
        BlockNames.setdefault(old_name, []).extend(BlockNames[new_name])

#
# Next process the Categories file. This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary with index category-name, with each entry containing
# all the codepoints of that category, in file order.  Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
# Only the first field (the hexadecimal code point) and the third field
# (the category) of each ';'-separated line are used.
#
# NOTE(review): UnicodeData files describe some large ranges with a pair
# of "<..., First>" / "<..., Last>" lines; only the two listed code points
# are recorded here, not the code points in between - confirm whether
# that is intended.
#
try:
    data = open(catfile, "r")
except OSError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

nbchar = 0
Categories = {}
# The context manager guarantees the file is closed even if parsing fails.
with data:
    for line in data:
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            # int() rejects an empty or non-hexadecimal field, so a
            # damaged line is reported instead of silently yielding a
            # bogus code point.
            value = int(fields[0].strip(), 16)
            name = fields[2]
            # Raises IndexError for an empty category field, which would
            # otherwise create a nameless category.
            general = name[0]
        except (ValueError, IndexError):
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name
        Categories.setdefault(general, []).append(value)

print("Parsed %d char generating %d categories" % (nbchar, len(Categories.keys())))

#
# The data is now all read.  Time to process it into a more useful form.
#
# Collapse each category's list of code points into a list of inclusive
# (first, last) tuples, merging runs of consecutive values.  A lone code
# point becomes a tuple whose two members are equal.
for cat in Categories.keys():
    codepoints = Categories[cat]
    ranges = []
    # -1 marks "no run opened yet"; real code points are never negative.
    first = -1
    last = -1
    for cp in codepoints:
        if first == -1:
            # very first value: open the initial run
            first = cp
        elif cp != last + 1:
            # gap found: close the current run and open a new one
            ranges.append((first, last))
            first = cp
        last = cp
    # close the run that was still open when the list ended
    ranges.append((first, last))
    Categories[cat] = ranges

#
# Put the block and category names in sorted order; everything below is
# emitted in this order.  (Per the original author, the generated tables
# are binary-searched, so the order matters.)
#
bkeys = sorted(BlockNames)
ckeys = sorted(Categories)

#
# Generate the resulting files
#
# The output path is relative, so it is resolved against the current
# working directory.  Only I/O failures are handled here; anything else
# (e.g. KeyboardInterrupt) is allowed to propagate.
#
try:
    output = open("codegen/unicode.inc", "w")
except OSError:
    print("Failed to open codegen/unicode.inc")
    sys.exit(1)

#
# Categories that have more than minTableSize ranges get a range table
# (for use with xmlCharInRange) plus an xmlChRangeGroup definition named
# xml<category>G.  The 'Cs' category is never emitted.
#
# NOTE(review): rangetab.gen_range_tables presumably writes the table
# definitions to 'output' and returns the initializer text for the
# group - confirm against rangetab.
#
for name in ckeys:
    ranges = Categories[name]
    if name != 'Cs' and len(ranges) > minTableSize:
        group = rangetab.gen_range_tables(output, 'xml' + name, 'S', 'L',
                                          ranges)
        output.write("static const xmlChRangeGroup xml%sG = %s;\n\n" %
                     (name, group))

# Emit one static C predicate, xmlUCSIsCat<name>(), per category (the 'Cs'
# category is skipped).  Categories with more than minTableSize ranges
# delegate to xmlCharInRange on their xml<name>G group; the others get
# their ranges spelled out as inline comparisons joined with "||".
for name in ckeys:
    if name == 'Cs':
        continue
    ranges = Categories[name]
    output.write("static int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    else:
        tests = []
        for (begin, end) in ranges:
            if begin == end:
                tests.append("(code == %s)" % (hex(begin)))
            else:
                tests.append("((code >= %s) && (code <= %s))" % (
                    hex(begin), hex(end)))
        if tests:
            output.write("    return(" + " ||\n           ".join(tests))
    # closes the open "return(" and the function body
    output.write(");\n}\n\n")

#
# Range tables for blocks
#
# Every block gets its range tables written out (the C identifier is the
# block name with '-' removed), and one {"name", group} initializer is
# collected per block for the xmlUnicodeBlocks array emitted afterwards.
#
# NOTE(review): rangetab.gen_range_tables presumably writes the table
# definitions to 'output' and returns the initializer text for the
# group - confirm against rangetab.
#
entries = []
for block in bkeys:
    ident = block.replace('-', '')
    group = rangetab.gen_range_tables(output, 'xml' + ident, 'S', 'L',
                                      BlockNames[block])
    output.write("\n")
    entries.append('  {"%s",\n   %s}' % (block, group))
blockGroups = ",\n".join(entries)

output.write("static const xmlUnicodeRange xmlUnicodeBlocks[] = {\n")
output.write(blockGroups)
output.write("\n};\n\n")

output.close()