From c5706c985ffe5f0bc4e8b9f23deee54da3d002c5 Mon Sep 17 00:00:00 2001 From: Abdullah Diab Date: Fri, 20 Jan 2023 14:03:32 +0000 Subject: [PATCH] Testing ligatures fix --- arabic_reshaper/arabic_reshaper.py | 126 ++++++++++++-------- arabic_reshaper/tests/test_004_reshaping.py | 36 ++++++ setup.py | 1 + 3 files changed, 110 insertions(+), 53 deletions(-) create mode 100644 arabic_reshaper/tests/test_004_reshaping.py diff --git a/arabic_reshaper/arabic_reshaper.py b/arabic_reshaper/arabic_reshaper.py index 52c6d01..a8c12e1 100644 --- a/arabic_reshaper/arabic_reshaper.py +++ b/arabic_reshaper/arabic_reshaper.py @@ -7,8 +7,9 @@ # Email: mpcabd@gmail.com # Website: http://mpcabd.xyz -import re +import regex as re +from collections import defaultdict from itertools import repeat from .ligatures import LIGATURES @@ -69,30 +70,37 @@ def __init__(self, configuration=None, configuration_file=None): self.letters = LETTERS_ARABIC @property - def _ligatures_re(self): - if not hasattr(self, '__ligatures_re'): - patterns = [] - re_group_index_to_ligature_forms = {} - index = 0 + def _ligature_patterns_by_length(self): + if not hasattr(self, '__ligature_patterns_by_length'): + patterns_by_length = defaultdict(list) + index_to_forms_by_length = defaultdict(dict) + index_by_length = defaultdict(lambda: 0) FORMS = 1 MATCH = 0 for ligature_record in LIGATURES: ligature, replacement = ligature_record if not self.configuration.getboolean(ligature): continue - re_group_index_to_ligature_forms[index] = replacement[FORMS] - patterns.append('({})'.format(replacement[MATCH])) - index += 1 - self._re_group_index_to_ligature_forms = ( - re_group_index_to_ligature_forms - ) - self.__ligatures_re = re.compile('|'.join(patterns), re.UNICODE) - return self.__ligatures_re - - def _get_ligature_forms_from_re_group_index(self, group_index): - if not hasattr(self, '_re_group_index_to_ligature_forms'): - return self._ligatures_re - return self._re_group_index_to_ligature_forms[group_index] + length = len(replacement[MATCH]) + index_to_forms_by_length[length][index_by_length[length]] = replacement[FORMS] + patterns_by_length[length].append('({})'.format(replacement[MATCH])) + index_by_length[length] += 1 + self._index_to_forms_by_length = index_to_forms_by_length + for l, patterns in patterns_by_length.items(): + patterns_by_length[l] = re.compile('|'.join(patterns), re.UNICODE) + self.__ligature_patterns_by_length = patterns_by_length + return self.__ligature_patterns_by_length + + @property + def _ligatures_lengths_sorted(self): + if not hasattr(self, '__ligatures_lengths_sorted'): + self.__ligatures_lengths_sorted = tuple(sorted(self._ligature_patterns_by_length.keys(), reverse=True)) + return self.__ligatures_lengths_sorted + + def _get_ligature_forms_from_re_group_index(self, match_length, group_index): + if not hasattr(self, '_index_to_forms_by_length'): + t = self._ligature_patterns_by_length + return self._index_to_forms_by_length[match_length][group_index] def reshape(self, text): if not text: @@ -183,41 +191,53 @@ def reshape(self, text): if delete_tatweel: text = text.replace(TATWEEL, '') - for match in re.finditer(self._ligatures_re, text): - group_index = next(( - i for i, group in enumerate(match.groups()) if group - ), -1) - forms = self._get_ligature_forms_from_re_group_index( - group_index - ) - a, b = match.span() - a_form = output[a][FORM] - b_form = output[b - 1][FORM] - ligature_form = None - - # +-----------+----------+---------+---------+----------+ - # | a \ b | ISOLATED | INITIAL | MEDIAL | FINAL | - # +-----------+----------+---------+---------+----------+ - # | ISOLATED | ISOLATED | INITIAL | INITIAL | ISOLATED | - # | INITIAL | ISOLATED | INITIAL | INITIAL | ISOLATED | - # | MEDIAL | FINAL | MEDIAL | MEDIAL | FINAL | - # | FINAL | FINAL | MEDIAL | MEDIAL | FINAL | - # +-----------+----------+---------+---------+----------+ - - if a_form in (isolated_form, INITIAL): - if b_form in (isolated_form, FINAL): - ligature_form = ISOLATED - else: - ligature_form = INITIAL - else: - if b_form in (isolated_form, FINAL): - ligature_form = FINAL + ligatures_indices = set() + + for match_length in self._ligatures_lengths_sorted: + ligatures_re = self._ligature_patterns_by_length[match_length] + + for match in re.finditer(ligatures_re, text, overlapped=True): + group_index = next(( + i for i, group in enumerate(match.groups()) if group + ), -1) + forms = self._get_ligature_forms_from_re_group_index( + match_length, + group_index + ) + a, b = match.span() + + # Skip letters that have been already used in ligatures + if next(filter(lambda i: i in ligatures_indices, range(a, b)), None): + continue + + a_form = output[a][FORM] + b_form = output[b - 1][FORM] + ligature_form = None + + # +-----------+----------+---------+---------+----------+ + # | a \ b | ISOLATED | INITIAL | MEDIAL | FINAL | + # +-----------+----------+---------+---------+----------+ + # | ISOLATED | ISOLATED | INITIAL | INITIAL | ISOLATED | + # | INITIAL | ISOLATED | INITIAL | INITIAL | ISOLATED | + # | MEDIAL | FINAL | MEDIAL | MEDIAL | FINAL | + # | FINAL | FINAL | MEDIAL | MEDIAL | FINAL | + # +-----------+----------+---------+---------+----------+ + + if a_form in (isolated_form, INITIAL): + if b_form in (isolated_form, FINAL): + ligature_form = ISOLATED + else: + ligature_form = INITIAL else: - ligature_form = MEDIAL - if not forms[ligature_form]: - continue - output[a] = (forms[ligature_form], NOT_SUPPORTED) - output[a+1:b] = repeat(('', NOT_SUPPORTED), b - 1 - a) + if b_form in (isolated_form, FINAL): + ligature_form = FINAL + else: + ligature_form = MEDIAL + if not forms[ligature_form]: + continue + output[a] = (forms[ligature_form], NOT_SUPPORTED) + output[a+1:b] = repeat(('', NOT_SUPPORTED), b - 1 - a) + ligatures_indices.update(range(a, b)) result = [] if not delete_harakat and -1 in positions_harakat: diff --git a/arabic_reshaper/tests/test_004_reshaping.py b/arabic_reshaper/tests/test_004_reshaping.py new file mode 100644 index 0000000..d23a2b2 --- /dev/null +++ b/arabic_reshaper/tests/test_004_reshaping.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +import arabic_reshaper +import arabic_reshaper.ligatures as ligatures +import itertools +import sys +import unittest + + +def _reshaping_test(test): + for i, case in enumerate(test.cases): + def t(): test.assertEqual(case[1], test.reshaper.reshape(case[0])) + if hasattr(test, 'subTest'): + with test.subTest(i=i, case=case[0]): + t() + else: + print('running test case %d' % i, file=sys.stderr) + t() + + +class TestDefaultReshaping(unittest.TestCase): + def setUp(self): + config = {} + for l in itertools.chain(ligatures.SENTENCES_LIGATURES, ligatures.WORDS_LIGATURES, ligatures.LETTERS_LIGATURES): + config[l[0]] = True + self.reshaper = arabic_reshaper.ArabicReshaper(config) + self.cases = ( + ('\u0645\u064A\u0646','\uFEE3\uFC94'), + ) + + def test_reshaping(self): + _reshaping_test(self) + + +if __name__ == '__main__': + unittest.main() diff --git a/setup.py b/setup.py index df40987..9ffd628 100755 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ platforms='ALL', license='MIT', packages=['arabic_reshaper'], + install_requires=['regex'], extras_require={ 'with-fonttools': ['fonttools>=4.0'] },