123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143 |
- #!/usr/bin/env python
- """
- Unicode case folding database conversion utility
- Parses the database and generates a C++ function which implements the case
- folding algorithm. The database entries are of the form:
- <code>; <status>; <mapping>; # <name>
- <status> can be one of four characters:
- C - Common mappings
- S - mappings for Simple case folding
- F - mappings for Full case folding
- T - special case for Turkish I characters
- Right now this generates a function which implements simple case folding (C+S
- entries).
- """
- from __future__ import print_function
- import sys
- import re
- try:
- from urllib.request import urlopen
- except ImportError:
- from urllib2 import urlopen
- # Accumulates the generated C++ statements that form the body of the
- # case-folding function. dump_block() appends to it, and it is printed
- # inside the emitted function definition at the end of the script.
- body = ""
# Reads the database line-by-line, extracts the Common and Simple case fold
# mappings and yields one (from_char, to_char, from_name) tuple per mapping.
#
# f is any iterable of lines, e.g. an open file or the response object
# returned by urlopen(). Lines may be text or bytes: urlopen() on Python 3
# yields bytes, which a text regular expression cannot be matched against, so
# bytes lines are decoded as UTF-8 first.
#
# Lines that do not look like "<code>; <C|S>; <mapping>; # <name>" (comments,
# blank lines, F and T entries) are skipped. from_char and to_char are integer
# code points parsed from hexadecimal.
#
# Raises Exception if the source code points are not strictly increasing,
# because the block-building code relies on sorted, unique input.
def mappings(f):
    previous_from = -1
    expr = re.compile(r'^(.*); [CS]; (.*); # (.*)')
    for line in f:
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        m = expr.match(line)
        if not m:
            continue
        from_char = int(m.group(1), 16)
        to_char = int(m.group(2), 16)
        from_name = m.group(3)
        if from_char <= previous_from:
            raise Exception("Duplicate or unsorted characters in input")
        yield from_char, to_char, from_name
        previous_from = from_char
# Returns how far a mapping moves its character: the folded code point minus
# the original code point (to_char - from_char).
def shift(mapping):
    source = mapping[0]
    target = mapping[1]
    return target - source
# Returns the distance between the source characters of two mappings
# (from_char of mapping2 minus from_char of mapping1).
def stride2(mapping1, mapping2):
    earlier = mapping1[0]
    later = mapping2[0]
    return later - earlier
# Returns the stride of a block (a list of mappings). Only the first two
# mappings are inspected, so the list must hold at least two entries and the
# caller is expected to pass a block whose stride is uniform throughout.
def stride(block):
    head = block[0]
    successor = block[1]
    return stride2(head, successor)
# Appends to the global `body` the C++ statements that fold every character
# covered by the block b.
#
# b is a list of (from_char, to_char, from_name) mappings. All the mappings are
# assumed to have the same shift and the stride between adjacent mappings (if
# any) is constant. An empty block emits nothing.
def dump_block(b):
    global body

    if not b:
        # Nothing was collected (e.g. the input had no C/S entries); indexing
        # b[0] below would raise IndexError.
        return

    if len(b) == 1:
        # Special case for handling blocks of length 1. We don't even need to
        # emit the "if (C < X) return C" check below as all characters in this
        # range will be caught by the "C < X" check emitted by the first
        # non-trivial block.
        body += " // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0])
        return

    # Stride and shift are constant over the block, so compute them once.
    step = stride(b)
    delta = shift(b[0])

    first = b[0][0]
    last = first + step * (len(b) - 1)
    modulo = first % step

    # All characters before this block map to themselves.
    body += " if (C < {0:#06x})\n return C;\n".format(first)
    body += " // {0} characters\n".format(len(b))

    # Generic pattern: check upper bound (lower bound is checked by the "if"
    # above) and modulo of C, return C+shift.
    pattern = " if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n"

    if step == 2 and delta == 1 and modulo == 0:
        # Special case:
        # We can elide the modulo-check because the expression "C|1" will map
        # the intervening characters to themselves.
        pattern = " if (C <= {0:#06x})\n return C | 1;\n"
    elif step == 1:
        # Another special case: X % 1 is always zero, so don't emit the
        # modulo-check.
        pattern = " if (C <= {0:#06x})\n return C + {3};\n"

    body += pattern.format(last, step, modulo, delta)
# Group consecutive mappings into blocks that share one shift and one stride,
# emitting each finished block through dump_block().
current_block = []
f = urlopen(sys.argv[1])
try:
    for m in mappings(f):
        if current_block:
            same_shift = shift(current_block[0]) == shift(m)
            # A block holding a single mapping has no stride yet, so any next
            # mapping with the same shift may join it.
            same_stride = (len(current_block) == 1 or
                           stride(current_block) == stride2(current_block[-1], m))
            if not (same_shift and same_stride):
                # Incompatible shift or stride, start a new block.
                dump_block(current_block)
                current_block = []
        current_block.append(m)
finally:
    # Close the response even if parsing raises part-way through.
    f.close()

# Flush the trailing block; skip it when the input had no usable entries.
if current_block:
    dump_block(current_block)
# Emit the generated C++ translation unit on stdout: a banner recording the
# database URL and the regeneration command, the include, and finally the
# foldCharSimple() definition wrapped around the accumulated body.
print('\n'.join([
    '//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//',
    '//',
    '// This file was generated by utils/unicode-case-fold.py from the Unicode',
    '// case folding database at',
    ' '.join(['// ', sys.argv[1]]),
    '//',
    '// To regenerate this file, run:',
    '// utils/unicode-case-fold.py \\',
    '// "{}" \\'.format(sys.argv[1]),
    '// > lib/Support/UnicodeCaseFold.cpp',
    '//',
    '//===----------------------------------------------------------------------===//',
    '',
    '#include "llvm/Support/Unicode.h"',
    '',
    "int llvm::sys::unicode::foldCharSimple(int C) {",
    body,
    " return C;",
    "}",
]))
|