2626# written by Fredrik Lundh (fredrik@pythonware.com)
2727#
2828
29+ import dataclasses
2930import os
3031import sys
3132import zipfile
3233
3334from functools import partial
3435from textwrap import dedent
35- from typing import Iterator , List , Tuple
36+ from typing import Iterator , List , Optional , Set , Tuple
3637
3738SCRIPT = sys .argv [0 ]
3839VERSION = "3.3"
@@ -148,12 +149,12 @@ def makeunicodedata(unicode, trace):
148149 record = unicode .table [char ]
149150 if record :
150151 # extract database properties
151- category = CATEGORY_NAMES .index (record [ 2 ] )
152- combining = int (record [ 3 ] )
153- bidirectional = BIDIRECTIONAL_NAMES .index (record [ 4 ] )
154- mirrored = record [ 9 ] == "Y"
155- eastasianwidth = EASTASIANWIDTH_NAMES .index (record [ 15 ] )
156- normalizationquickcheck = record [ 17 ]
152+ category = CATEGORY_NAMES .index (record . general_category )
153+ combining = int (record . canonical_combining_class )
154+ bidirectional = BIDIRECTIONAL_NAMES .index (record . bidi_class )
155+ mirrored = record . bidi_mirrored == "Y"
156+ eastasianwidth = EASTASIANWIDTH_NAMES .index (record . east_asian_width )
157+ normalizationquickcheck = record . quick_check
157158 item = (
158159 category , combining , bidirectional , mirrored , eastasianwidth ,
159160 normalizationquickcheck
@@ -179,8 +180,8 @@ def makeunicodedata(unicode, trace):
179180 for char in unicode .chars :
180181 record = unicode .table [char ]
181182 if record :
182- if record [ 5 ] :
183- decomp = record [ 5 ] .split ()
183+ if record . decomposition_type :
184+ decomp = record . decomposition_type .split ()
184185 if len (decomp ) > 19 :
185186 raise Exception ("character %x has a decomposition too large for nfd_nfkd" % char )
186187 # prefix
@@ -200,7 +201,7 @@ def makeunicodedata(unicode, trace):
200201 # Collect NFC pairs
201202 if not prefix and len (decomp ) == 3 and \
202203 char not in unicode .exclusions and \
203- unicode .table [decomp [1 ]][ 3 ] == "0" :
204+ unicode .table [decomp [1 ]]. canonical_combining_class == "0" :
204205 p , l , r = decomp
205206 comp_first [l ] = 1
206207 comp_last [r ] = 1
@@ -404,9 +405,9 @@ def makeunicodetype(unicode, trace):
404405 record = unicode .table [char ]
405406 if record :
406407 # extract database properties
407- category = record [ 2 ]
408- bidirectional = record [ 4 ]
409- properties = record [ 16 ]
408+ category = record . general_category
409+ bidirectional = record . bidi_class
410+ properties = record . binary_properties
410411 flags = 0
411412 if category in ["Lm" , "Lt" , "Lu" , "Ll" , "Lo" ]:
412413 flags |= ALPHA_MASK
@@ -434,16 +435,16 @@ def makeunicodetype(unicode, trace):
434435 flags |= CASE_IGNORABLE_MASK
435436 sc = unicode .special_casing .get (char )
436437 cf = unicode .case_folding .get (char , [char ])
437- if record [ 12 ] :
438- upper = int (record [ 12 ] , 16 )
438+ if record . simple_uppercase_mapping :
439+ upper = int (record . simple_uppercase_mapping , 16 )
439440 else :
440441 upper = char
441- if record [ 13 ] :
442- lower = int (record [ 13 ] , 16 )
442+ if record . simple_lowercase_mapping :
443+ lower = int (record . simple_lowercase_mapping , 16 )
443444 else :
444445 lower = char
445- if record [ 14 ] :
446- title = int (record [ 14 ] , 16 )
446+ if record . simple_titlecase_mapping :
447+ title = int (record . simple_titlecase_mapping , 16 )
447448 else :
448449 title = upper
449450 if sc is None and cf != [lower ]:
@@ -480,16 +481,16 @@ def makeunicodetype(unicode, trace):
480481 extra_casing .extend (sc [1 ])
481482 # decimal digit, integer digit
482483 decimal = 0
483- if record [ 6 ] :
484+ if record . decomposition_mapping :
484485 flags |= DECIMAL_MASK
485- decimal = int (record [ 6 ] )
486+ decimal = int (record . decomposition_mapping )
486487 digit = 0
487- if record [ 7 ] :
488+ if record . numeric_type :
488489 flags |= DIGIT_MASK
489- digit = int (record [ 7 ] )
490- if record [ 8 ] :
490+ digit = int (record . numeric_type )
491+ if record . numeric_value :
491492 flags |= NUMERIC_MASK
492- numeric .setdefault (record [ 8 ] , []).append (char )
493+ numeric .setdefault (record . numeric_value , []).append (char )
493494 item = (
494495 upper , lower , title , decimal , digit , flags
495496 )
@@ -609,7 +610,7 @@ def makeunicodename(unicode, trace):
609610 for char in unicode .chars :
610611 record = unicode .table [char ]
611612 if record :
612- name = record [ 1 ] .strip ()
613+ name = record . name .strip ()
613614 if name and name [0 ] != "<" :
614615 names [char ] = name + chr (0 )
615616
@@ -719,7 +720,7 @@ def word_key(a):
719720 for char in unicode .chars :
720721 record = unicode .table [char ]
721722 if record :
722- name = record [ 1 ] .strip ()
723+ name = record . name .strip ()
723724 if name and name [0 ] != "<" :
724725 data .append ((name , char ))
725726
@@ -819,31 +820,27 @@ def merge_old_version(version, new, old):
819820 continue
820821 # check characters that differ
821822 if old .table [i ] != new .table [i ]:
822- for k in range (len (old .table [i ])):
823- if old .table [i ][k ] != new .table [i ][k ]:
824- value = old .table [i ][k ]
823+ for k , field in enumerate (dataclasses .fields (UcdRecord )):
824+ value = getattr (old .table [i ], field .name )
825+ new_value = getattr (new .table [i ], field .name )
826+ if value != new_value :
825827 if k == 1 and i in PUA_15 :
826828 # the name is not set in the old.table, but in the
827829 # new.table we are using it for aliases and named seq
828830 assert value == ''
829831 elif k == 2 :
830- #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
831832 category_changes [i ] = CATEGORY_NAMES .index (value )
832833 elif k == 4 :
833- #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
834834 bidir_changes [i ] = BIDIRECTIONAL_NAMES .index (value )
835835 elif k == 5 :
836- #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
837836 # We assume that all normalization changes are in 1:1 mappings
838837 assert " " not in value
839838 normalization_changes .append ((i , value ))
840839 elif k == 6 :
841- #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
842840 # we only support changes where the old value is a single digit
843841 assert value in "0123456789"
844842 decimal_changes [i ] = int (value )
845843 elif k == 8 :
846- # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
847844 # Since 0 encodes "no change", the old value is better not 0
848845 if not value :
849846 numeric_changes [i ] = - 1
@@ -952,25 +949,60 @@ def expanded(self) -> Iterator[Tuple[int, List[str]]]:
952949 yield char , rest
953950
954951
@dataclasses.dataclass
class UcdRecord:
    """All the data this script keeps about a single code point.

    The first 15 fields are the raw string fields of one UnicodeData.txt
    row, in file order.  The last three fields are not part of that row;
    they default to "nothing known yet" so a record can be built from the
    15 row fields alone.
    """

    # 15 fields from UnicodeData.txt .  See:
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
    #
    # NOTE(review): four of the names below do not match how this script
    # uses the values; see the per-field comments.  The names are part of
    # the attribute interface used throughout the file, so they are kept.
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    # Used by this script as the whole decomposition field: it is
    # whitespace-split into a list of parts.
    decomposition_type: str
    # Used by this script as the decimal value: int() of it is stored and
    # DECIMAL_MASK is set when it is non-empty.
    decomposition_mapping: str
    # Used by this script as the digit value: int() of it is stored and
    # DIGIT_MASK is set when it is non-empty.
    numeric_type: str
    # Used by this script as the numeric value (NUMERIC_MASK); may be
    # overwritten later from the Unihan data.
    numeric_value: str
    bidi_mirrored: str  # compared against "Y"
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str  # hex code point, or empty
    simple_lowercase_mapping: str  # hex code point, or empty
    simple_titlecase_mapping: str  # hex code point, or empty

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    east_asian_width: Optional[str] = None

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    # default_factory gives every record its own set; a plain `= set()`
    # default is rejected by dataclasses because it would be shared.
    binary_properties: Set[str] = dataclasses.field(default_factory=set)

    # The Quick_Check properties related to normalization:
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    quick_check: int = 0
985+
986+
def from_row(row: List[str]) -> UcdRecord:
    """Wrap the fields of one UnicodeData.txt row in a UcdRecord.

    The items of *row* become the leading positional fields of the
    record.  The three trailing fields are filled with placeholders:
    no East Asian width, an empty (fresh) property set, and a zero
    quick-check bitmask.
    """
    no_width = None
    no_properties = set()
    no_quick_check = 0
    return UcdRecord(*row, no_width, no_properties, no_quick_check)
989+
990+
955991# --------------------------------------------------------------------
956992# the following support code is taken from the unidb utilities
957993# Copyright (c) 1999-2000 by Secret Labs AB
958994
959995# load a unicode-data file from disk
960996
961997class UnicodeData :
962- # Record structure:
963- # [ID, name, category, combining, bidi, decomp, (6)
964- # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
965- # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
966- # derived-props] (17)
998+ # table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned
967999
9681000 def __init__ (self , version , cjk_check = True ):
9691001 self .changed = []
9701002 table = [None ] * 0x110000
9711003 for s in UcdFile (UNICODE_DATA , version ):
9721004 char = int (s [0 ], 16 )
973- table [char ] = s
1005+ table [char ] = from_row ( s )
9741006
9751007 cjk_ranges_found = []
9761008
@@ -982,19 +1014,17 @@ def __init__(self, version, cjk_check=True):
9821014 # https://www.unicode.org/reports/tr44/#Code_Point_Ranges
9831015 s = table [i ]
9841016 if s :
985- if s [ 1 ] [- 6 :] == "First>" :
986- s [ 1 ] = ""
987- field = s
988- elif s [ 1 ] [- 5 :] == "Last>" :
989- if s [ 1 ] .startswith ("<CJK Ideograph" ):
1017+ if s . name [- 6 :] == "First>" :
1018+ s . name = ""
1019+ field = dataclasses . astuple ( s )[: 15 ]
1020+ elif s . name [- 5 :] == "Last>" :
1021+ if s . name .startswith ("<CJK Ideograph" ):
9901022 cjk_ranges_found .append ((field [0 ],
991- s [ 0 ] ))
992- s [ 1 ] = ""
1023+ s . codepoint ))
1024+ s . name = ""
9931025 field = None
9941026 elif field :
995- f2 = field [:]
996- f2 [0 ] = "%X" % i
997- table [i ] = f2
1027+ table [i ] = from_row (('%X' % i ,) + field [1 :])
9981028 if cjk_check and cjk_ranges != cjk_ranges_found :
9991029 raise ValueError ("CJK ranges deviate: have %r" % cjk_ranges_found )
10001030
@@ -1015,7 +1045,7 @@ def __init__(self, version, cjk_check=True):
10151045 char = int (char , 16 )
10161046 self .aliases .append ((name , char ))
10171047 # also store the name in the PUA 1
1018- self .table [pua_index ][ 1 ] = name
1048+ self .table [pua_index ]. name = name
10191049 pua_index += 1
10201050 assert pua_index - NAME_ALIASES_START == len (self .aliases )
10211051
@@ -1034,7 +1064,7 @@ def __init__(self, version, cjk_check=True):
10341064 "the NamedSequence struct and in unicodedata_lookup" )
10351065 self .named_sequences .append ((name , chars ))
10361066 # also store these in the PUA 1
1037- self .table [pua_index ][ 1 ] = name
1067+ self .table [pua_index ]. name = name
10381068 pua_index += 1
10391069 assert pua_index - NAMED_SEQUENCES_START == len (self .named_sequences )
10401070
@@ -1049,23 +1079,19 @@ def __init__(self, version, cjk_check=True):
10491079
10501080 for i in range (0 , 0x110000 ):
10511081 if table [i ] is not None :
1052- table [i ].append (widths [i ])
1053-
1054- for i in range (0 , 0x110000 ):
1055- if table [i ] is not None :
1056- table [i ].append (set ())
1082+ table [i ].east_asian_width = widths [i ]
10571083
10581084 for char , (p ,) in UcdFile (DERIVED_CORE_PROPERTIES , version ).expanded ():
10591085 if table [char ]:
10601086 # Some properties (e.g. Default_Ignorable_Code_Point)
10611087 # apply to unassigned code points; ignore them
1062- table [char ][ - 1 ] .add (p )
1088+ table [char ]. binary_properties .add (p )
10631089
10641090 for char_range , value in UcdFile (LINE_BREAK , version ):
10651091 if value not in MANDATORY_LINE_BREAKS :
10661092 continue
10671093 for char in expand_range (char_range ):
1068- table [char ][ - 1 ] .add ('Line_Break' )
1094+ table [char ]. binary_properties .add ('Line_Break' )
10691095
10701096 # We only want the quickcheck properties
10711097 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -1087,7 +1113,7 @@ def __init__(self, version, cjk_check=True):
10871113 quickchecks [char ] |= quickcheck
10881114 for i in range (0 , 0x110000 ):
10891115 if table [i ] is not None :
1090- table [i ].append ( quickchecks [i ])
1116+ table [i ].quick_check = quickchecks [i ]
10911117
10921118 with open_data (UNIHAN , version ) as file :
10931119 zip = zipfile .ZipFile (file )
@@ -1106,7 +1132,7 @@ def __init__(self, version, cjk_check=True):
11061132 i = int (code [2 :], 16 )
11071133 # Patch the numeric field
11081134 if table [i ] is not None :
1109- table [i ][ 8 ] = value
1135+ table [i ]. numeric_value = value
11101136
11111137 sc = self .special_casing = {}
11121138 for data in UcdFile (SPECIAL_CASING , version ):
0 commit comments