@@ -40,15 +40,19 @@ mod unicodedata {
4040 builtins:: { PyModule , PyStrRef } ,
4141 function:: OptionalArg ,
4242 } ;
43+
44+ use icu_normalizer:: { ComposingNormalizerBorrowed , DecomposingNormalizerBorrowed } ;
45+ use icu_properties:: {
46+ CodePointSetData ,
47+ props:: {
48+ BidiClass , BidiMirrored , CanonicalCombiningClass , EastAsianWidth , EnumeratedProperty ,
49+ GeneralCategory , NamedEnumeratedProperty ,
50+ } ,
51+ } ;
4352 use itertools:: Itertools ;
4453 use rustpython_common:: wtf8:: { CodePoint , Wtf8Buf } ;
45- use ucd:: { Codepoint , DecompositionType , EastAsianWidth , Number , NumericType } ;
46- use unic_char_property:: EnumeratedCharProperty ;
47- use unic_normal:: StrNormalForm ;
54+ use ucd:: { Codepoint , DecompositionType , Number , NumericType } ;
4855 use unic_ucd_age:: { Age , UNICODE_VERSION , UnicodeVersion } ;
49- use unic_ucd_bidi:: BidiClass ;
50- use unic_ucd_category:: GeneralCategory ;
51- use unicode_bidi_mirroring:: is_mirroring;
5256
5357 pub ( crate ) fn module_exec ( vm : & VirtualMachine , module : & Py < PyModule > ) -> PyResult < ( ) > {
5458 __module_exec ( vm, module) ;
@@ -117,9 +121,9 @@ mod unicodedata {
117121 . extract_char ( character, vm) ?
118122 . map_or ( GeneralCategory :: Unassigned , |c| {
119123 c. to_char ( )
120- . map_or ( GeneralCategory :: Surrogate , GeneralCategory :: of )
124+ . map_or ( GeneralCategory :: Surrogate , GeneralCategory :: for_char )
121125 } )
122- . abbr_name ( )
126+ . short_name ( )
123127 . to_owned ( ) )
124128 }
125129
@@ -165,8 +169,8 @@ mod unicodedata {
165169 let bidi = match self . extract_char ( character, vm) ? {
166170 Some ( c) => c
167171 . to_char ( )
168- . map_or ( BidiClass :: LeftToRight , BidiClass :: of )
169- . abbr_name ( ) ,
172+ . map_or ( BidiClass :: LeftToRight , BidiClass :: for_char )
173+ . short_name ( ) ,
170174 None => "" ,
171175 } ;
172176 Ok ( bidi)
@@ -182,18 +186,34 @@ mod unicodedata {
182186 Ok ( self
183187 . extract_char ( character, vm) ?
184188 . and_then ( |c| c. to_char ( ) )
185- . map_or ( EastAsianWidth :: Neutral , |c| c . east_asian_width ( ) )
186- . abbr_name ( ) )
189+ . map_or ( EastAsianWidth :: Neutral , EastAsianWidth :: for_char )
190+ . short_name ( ) )
187191 }
188192
189193 #[ pymethod]
190194 fn normalize ( & self , form : super :: NormalizeForm , unistr : PyStrRef ) -> PyResult < Wtf8Buf > {
191195 let text = unistr. as_wtf8 ( ) ;
192196 let normalized_text = match form {
193- Nfc => text. map_utf8 ( |s| s. nfc ( ) ) . collect ( ) ,
194- Nfkc => text. map_utf8 ( |s| s. nfkc ( ) ) . collect ( ) ,
195- Nfd => text. map_utf8 ( |s| s. nfd ( ) ) . collect ( ) ,
196- Nfkd => text. map_utf8 ( |s| s. nfkd ( ) ) . collect ( ) ,
197+ Nfc => {
198+ let normalizer = ComposingNormalizerBorrowed :: new_nfc ( ) ;
199+ text. map_utf8 ( |s| normalizer. normalize_iter ( s. chars ( ) ) )
200+ . collect ( )
201+ }
202+ Nfkc => {
203+ let normalizer = ComposingNormalizerBorrowed :: new_nfkc ( ) ;
204+ text. map_utf8 ( |s| normalizer. normalize_iter ( s. chars ( ) ) )
205+ . collect ( )
206+ }
207+ Nfd => {
208+ let normalizer = DecomposingNormalizerBorrowed :: new_nfd ( ) ;
209+ text. map_utf8 ( |s| normalizer. normalize_iter ( s. chars ( ) ) )
210+ . collect ( )
211+ }
212+ Nfkd => {
213+ let normalizer = DecomposingNormalizerBorrowed :: new_nfkd ( ) ;
214+ text. map_utf8 ( |s| normalizer. normalize_iter ( s. chars ( ) ) )
215+ . collect ( )
216+ }
197217 } ;
198218 Ok ( normalized_text)
199219 }
@@ -202,10 +222,26 @@ mod unicodedata {
202222 fn is_normalized ( & self , form : super :: NormalizeForm , unistr : PyStrRef ) -> PyResult < bool > {
203223 let text = unistr. as_wtf8 ( ) ;
204224 let normalized: Wtf8Buf = match form {
205- Nfc => text. map_utf8 ( |s| s. nfc ( ) ) . collect ( ) ,
206- Nfkc => text. map_utf8 ( |s| s. nfkc ( ) ) . collect ( ) ,
207- Nfd => text. map_utf8 ( |s| s. nfd ( ) ) . collect ( ) ,
208- Nfkd => text. map_utf8 ( |s| s. nfkd ( ) ) . collect ( ) ,
225+ Nfc => {
226+ let normalizer = ComposingNormalizerBorrowed :: new_nfc ( ) ;
227+ text. map_utf8 ( |s| normalizer. normalize_iter ( s. chars ( ) ) )
228+ . collect ( )
229+ }
230+ Nfkc => {
231+ let normalizer = ComposingNormalizerBorrowed :: new_nfkc ( ) ;
232+ text. map_utf8 ( |s| normalizer. normalize_iter ( s. chars ( ) ) )
233+ . collect ( )
234+ }
235+ Nfd => {
236+ let normalizer = DecomposingNormalizerBorrowed :: new_nfd ( ) ;
237+ text. map_utf8 ( |s| normalizer. normalize_iter ( s. chars ( ) ) )
238+ . collect ( )
239+ }
240+ Nfkd => {
241+ let normalizer = DecomposingNormalizerBorrowed :: new_nfkd ( ) ;
242+ text. map_utf8 ( |s| normalizer. normalize_iter ( s. chars ( ) ) )
243+ . collect ( )
244+ }
209245 } ;
210246 Ok ( text == & * normalized)
211247 }
@@ -216,7 +252,8 @@ mod unicodedata {
216252 Some ( c) => {
217253 if let Some ( ch) = c. to_char ( ) {
218254 // Check if the character is mirrored in bidirectional text using Unicode standard
219- Ok ( if is_mirroring ( ch) { 1 } else { 0 } )
255+ let bidi_mirrored = CodePointSetData :: new :: < BidiMirrored > ( ) ;
256+ Ok ( if bidi_mirrored. contains ( ch) { 1 } else { 0 } )
220257 } else {
221258 Ok ( 0 )
222259 }
@@ -226,11 +263,13 @@ mod unicodedata {
226263 }
227264
228265 #[ pymethod]
229- fn combining ( & self , character : PyStrRef , vm : & VirtualMachine ) -> PyResult < i32 > {
266+ fn combining ( & self , character : PyStrRef , vm : & VirtualMachine ) -> PyResult < u8 > {
230267 Ok ( self
231268 . extract_char ( character, vm) ?
232269 . and_then ( |c| c. to_char ( ) )
233- . map_or ( 0 , |ch| ch. canonical_combining_class ( ) as i32 ) )
270+ . map_or ( 0 , |ch| {
271+ CanonicalCombiningClass :: for_char ( ch) . to_icu4c_value ( )
272+ } ) )
234273 }
235274
236275 #[ pymethod]
@@ -339,23 +378,6 @@ mod unicodedata {
339378 }
340379 }
341380
342- trait EastAsianWidthAbbrName {
343- fn abbr_name ( & self ) -> & ' static str ;
344- }
345-
346- impl EastAsianWidthAbbrName for EastAsianWidth {
347- fn abbr_name ( & self ) -> & ' static str {
348- match self {
349- Self :: Narrow => "Na" ,
350- Self :: Wide => "W" ,
351- Self :: Neutral => "N" ,
352- Self :: Ambiguous => "A" ,
353- Self :: FullWidth => "F" ,
354- Self :: HalfWidth => "H" ,
355- }
356- }
357- }
358-
359381 #[ pyattr]
360382 fn ucd_3_2_0 ( vm : & VirtualMachine ) -> PyRef < Ucd > {
361383 Ucd {
0 commit comments