Merge pull request ppannuto#46 from acabal/master

ppannuto · web-flow · commit 788428177033 · 2020-04-22T08:54:49.000-04:00
Add support for titlecasing non-ASCII letters
diff --git a/.travis.yml b/.travis.yml
@@ -8,6 +8,6 @@ python:
   - "3.6"
   - "3.7"
 
-install: pip install tox-travis coveralls nose
+install: pip install tox-travis regex coveralls nose
 script: tox
 
diff --git a/README.rst b/README.rst
@@ -72,3 +72,18 @@ Titlecase also provides a command line utility ``titlecase``:
     # Or read/write files:
     $ titlecase -f infile -o outfile
 
+
+Limitations
+-----------
+
+This is a best-effort library that uses regexes to try to do intelligent
+things, but will have limitations. For example, it does not have the contextual
+awareness to distinguish acronyms from words: us (we) versus US (United States).
+
+The regexes and titlecasing rules were written for American English. While
+there is basic support for Unicode characters, such that something like
+"El Niño" will work, it is likely that accents or non-English phrases will
+not be handled correctly.
+
+If anyone has concrete solutions to improve these or other shortcomings of the
+library, pull requests are very welcome!
diff --git a/setup.py b/setup.py
@@ -38,8 +38,8 @@ def readme():
     packages=find_packages(),
     include_package_data=True,
     zip_safe=False,
-    tests_require=['nose'],
-    setup_requires=['nose>=1.0'],
+    tests_require=['nose', 'regex'],
+    setup_requires=['nose>=1.0', 'regex>=2020.4.4'],
     test_suite="titlecase.tests",
     entry_points = {
         'console_scripts': [
diff --git a/titlecase/__init__.py b/titlecase/__init__.py
@@ -10,25 +10,26 @@
 from __future__ import unicode_literals
 
 import argparse
-import re
 import sys
 
+import regex
+
 __all__ = ['titlecase']
 __version__ = '0.12.0'
 
 SMALL = r'a|an|and|as|at|but|by|en|for|if|in|of|on|or|the|to|v\.?|via|vs\.?'
 PUNCT = r"""!"“#$%&'‘()*+,\-–‒—―./:;?@[\\\]_`{|}~"""
 
-SMALL_WORDS = re.compile(r'^(%s)$' % SMALL, re.I)
-INLINE_PERIOD = re.compile(r'[a-z][.][a-z]', re.I)
-UC_ELSEWHERE = re.compile(r'[%s]*?[a-zA-Z]+[A-Z]+?' % PUNCT)
-CAPFIRST = re.compile(r"^[%s]*?([A-Za-z])" % PUNCT)
-SMALL_FIRST = re.compile(r'^([%s]*)(%s)\b' % (PUNCT, SMALL), re.I)
-SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), re.I)
-SUBPHRASE = re.compile(r'([:.;?!\-–‒—―][ ])(%s)' % SMALL)
-APOS_SECOND = re.compile(r"^[dol]{1}['‘]{1}[a-z]+(?:['s]{2})?$", re.I)
-UC_INITIALS = re.compile(r"^(?:[A-Z]{1}\.{1}|[A-Z]{1}\.{1}[A-Z]{1})+$")
-MAC_MC = re.compile(r"^([Mm]c|MC)(\w.+)")
+SMALL_WORDS = regex.compile(r'^(%s)$' % SMALL, regex.I)
+INLINE_PERIOD = regex.compile(r'[\p{Letter}][.][\p{Letter}]', regex.I)
+UC_ELSEWHERE = regex.compile(r'[%s]*?[\p{Letter}]+[\p{Uppercase_Letter}]+?' % PUNCT)
+CAPFIRST = regex.compile(r"^[%s]*?([\p{Letter}])" % PUNCT)
+SMALL_FIRST = regex.compile(r'^([%s]*)(%s)\b' % (PUNCT, SMALL), regex.I)
+SMALL_LAST = regex.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), regex.I)
+SUBPHRASE = regex.compile(r'([:.;?!\-–‒—―][ ])(%s)' % SMALL)
+APOS_SECOND = regex.compile(r"^[dol]{1}['‘]{1}[\p{Letter}]+(?:['s]{2})?$", regex.I)
+UC_INITIALS = regex.compile(r"^(?:[\p{Uppercase_Letter}]{1}\.{1}|[\p{Uppercase_Letter}]{1}\.{1}[\p{Uppercase_Letter}]{1})+$")
+MAC_MC = regex.compile(r"^([Mm]c|MC)(\w.+)")
 
 
 class Immutable(object):
@@ -57,10 +58,10 @@ def set_small_word_list(small=SMALL):
     global SMALL_FIRST
     global SMALL_LAST
     global SUBPHRASE
-    SMALL_WORDS = re.compile(r'^(%s)$' % small, re.I)
-    SMALL_FIRST = re.compile(r'^([%s]*)(%s)\b' % (PUNCT, small), re.I)
-    SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (small, PUNCT), re.I)
-    SUBPHRASE = re.compile(r'([:.;?!][ ])(%s)' % small)
+    SMALL_WORDS = regex.compile(r'^(%s)$' % small, regex.I)
+    SMALL_FIRST = regex.compile(r'^([%s]*)(%s)\b' % (PUNCT, small), regex.I)
+    SMALL_LAST = regex.compile(r'\b(%s)[%s]?$' % (small, PUNCT), regex.I)
+    SUBPHRASE = regex.compile(r'([:.;?!][ ])(%s)' % small)
 
 
 def titlecase(text, callback=None, small_first_last=True):
@@ -75,11 +76,11 @@ def titlecase(text, callback=None, small_first_last=True):
 
     """
 
-    lines = re.split('[\r\n]+', text)
+    lines = regex.split('[\r\n]+', text)
     processed = []
     for line in lines:
         all_caps = line.upper() == line
-        words = re.split('[\t ]', line)
+        words = regex.split('[\t ]', line)
         tc_line = []
         for word in words:
             if callback:
diff --git a/titlecase/tests.py b/titlecase/tests.py
@@ -264,6 +264,10 @@
         "“YOUNG AND RESTLESS”",
         "“Young and Restless”",
     ),
+    (
+        "EL NIÑO A ARRIVÉ HIER",
+        "El Niño a Arrivé Hier",
+    ),
 )
 
 
diff --git a/tox.ini b/tox.ini
@@ -12,6 +12,7 @@ envlist = py26, py27, py33, py34, py35
 passenv = TRAVIS TRAVIS_JOB_ID TRAVIS_BRANCH
 deps =
     nose
+    regex
     coveralls
 commands =
     coverage run --source=titlecase setup.py nosetests

Original file line number	Diff line number	Diff line change
`@@ -264,6 +264,10 @@`
`264`	`264`	`"“YOUNG AND RESTLESS”",`
`265`	`265`	`"“Young and Restless”",`
`266`	`266`	`),`
	`267`	`+ (`
	`268`	`+ "EL NIÑO A ARRIVÉ HIER",`
	`269`	`+ "El Niño a Arrivé Hier",`
	`270`	`+ ),`
`267`	`271`	`)`
`268`	`272`
`269`	`273`