Skip to content

Commit 7884281

Browse files
authored
Merge pull request ppannuto#46 from acabal/master
Add support for titlecasing non-ASCII letters
2 parents 818f138 + 4aa3ad4 commit 7884281

File tree

6 files changed

+41
-20
lines changed

6 files changed

+41
-20
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,6 @@ python:
88
- "3.6"
99
- "3.7"
1010

11-
install: pip install tox-travis coveralls nose
11+
install: pip install tox-travis regex coveralls nose
1212
script: tox
1313

README.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,18 @@ Titlecase also provides a command line utility ``titlecase``:
7272
# Or read/write files:
7373
$ titlecase -f infile -o outfile
7474
75+
76+
Limitations
77+
-----------
78+
79+
This is a best-effort library that uses regexes to try to do intelligent
80+
things, but will have limitations. For example, it does not have the contextual
81+
awareness to distinguish acronyms from words: us (we) versus US (United States).
82+
83+
The regexes and titlecasing rules were written for American English. While
84+
there is basic support for Unicode characters, such that something like
85+
"El Niño" will work, it is likely that accents or non-English phrases will
86+
not be handled correctly.
87+
88+
If anyone has concrete solutions to improve these or other shortcomings of the
89+
library, pull requests are very welcome!

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ def readme():
3838
packages=find_packages(),
3939
include_package_data=True,
4040
zip_safe=False,
41-
tests_require=['nose'],
42-
setup_requires=['nose>=1.0'],
41+
tests_require=['nose', 'regex'],
42+
setup_requires=['nose>=1.0', 'regex>=2020.4.4'],
4343
test_suite="titlecase.tests",
4444
entry_points = {
4545
'console_scripts': [

titlecase/__init__.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,25 +10,26 @@
1010
from __future__ import unicode_literals
1111

1212
import argparse
13-
import re
1413
import sys
1514

15+
import regex
16+
1617
__all__ = ['titlecase']
1718
__version__ = '0.12.0'
1819

1920
SMALL = r'a|an|and|as|at|but|by|en|for|if|in|of|on|or|the|to|v\.?|via|vs\.?'
2021
PUNCT = r"""!"“#$%&'‘()*+,\-–‒—―./:;?@[\\\]_`{|}~"""
2122

22-
SMALL_WORDS = re.compile(r'^(%s)$' % SMALL, re.I)
23-
INLINE_PERIOD = re.compile(r'[a-z][.][a-z]', re.I)
24-
UC_ELSEWHERE = re.compile(r'[%s]*?[a-zA-Z]+[A-Z]+?' % PUNCT)
25-
CAPFIRST = re.compile(r"^[%s]*?([A-Za-z])" % PUNCT)
26-
SMALL_FIRST = re.compile(r'^([%s]*)(%s)\b' % (PUNCT, SMALL), re.I)
27-
SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), re.I)
28-
SUBPHRASE = re.compile(r'([:.;?!\-–‒—―][ ])(%s)' % SMALL)
29-
APOS_SECOND = re.compile(r"^[dol]{1}['‘]{1}[a-z]+(?:['s]{2})?$", re.I)
30-
UC_INITIALS = re.compile(r"^(?:[A-Z]{1}\.{1}|[A-Z]{1}\.{1}[A-Z]{1})+$")
31-
MAC_MC = re.compile(r"^([Mm]c|MC)(\w.+)")
23+
SMALL_WORDS = regex.compile(r'^(%s)$' % SMALL, regex.I)
24+
INLINE_PERIOD = regex.compile(r'[\p{Letter}][.][\p{Letter}]', regex.I)
25+
UC_ELSEWHERE = regex.compile(r'[%s]*?[\p{Letter}]+[\p{Uppercase_Letter}]+?' % PUNCT)
26+
CAPFIRST = regex.compile(r"^[%s]*?([\p{Letter}])" % PUNCT)
27+
SMALL_FIRST = regex.compile(r'^([%s]*)(%s)\b' % (PUNCT, SMALL), regex.I)
28+
SMALL_LAST = regex.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), regex.I)
29+
SUBPHRASE = regex.compile(r'([:.;?!\-–‒—―][ ])(%s)' % SMALL)
30+
APOS_SECOND = regex.compile(r"^[dol]{1}['‘]{1}[\p{Letter}]+(?:['s]{2})?$", regex.I)
31+
UC_INITIALS = regex.compile(r"^(?:[\p{Uppercase_Letter}]{1}\.{1}|[\p{Uppercase_Letter}]{1}\.{1}[\p{Uppercase_Letter}]{1})+$")
32+
MAC_MC = regex.compile(r"^([Mm]c|MC)(\w.+)")
3233

3334

3435
class Immutable(object):
@@ -57,10 +58,10 @@ def set_small_word_list(small=SMALL):
5758
global SMALL_FIRST
5859
global SMALL_LAST
5960
global SUBPHRASE
60-
SMALL_WORDS = re.compile(r'^(%s)$' % small, re.I)
61-
SMALL_FIRST = re.compile(r'^([%s]*)(%s)\b' % (PUNCT, small), re.I)
62-
SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (small, PUNCT), re.I)
63-
SUBPHRASE = re.compile(r'([:.;?!][ ])(%s)' % small)
61+
SMALL_WORDS = regex.compile(r'^(%s)$' % small, regex.I)
62+
SMALL_FIRST = regex.compile(r'^([%s]*)(%s)\b' % (PUNCT, small), regex.I)
63+
SMALL_LAST = regex.compile(r'\b(%s)[%s]?$' % (small, PUNCT), regex.I)
64+
SUBPHRASE = regex.compile(r'([:.;?!][ ])(%s)' % small)
6465

6566

6667
def titlecase(text, callback=None, small_first_last=True):
@@ -75,11 +76,11 @@ def titlecase(text, callback=None, small_first_last=True):
7576
7677
"""
7778

78-
lines = re.split('[\r\n]+', text)
79+
lines = regex.split('[\r\n]+', text)
7980
processed = []
8081
for line in lines:
8182
all_caps = line.upper() == line
82-
words = re.split('[\t ]', line)
83+
words = regex.split('[\t ]', line)
8384
tc_line = []
8485
for word in words:
8586
if callback:

titlecase/tests.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,10 @@
264264
"“YOUNG AND RESTLESS”",
265265
"“Young and Restless”",
266266
),
267+
(
268+
"EL NIÑO A ARRIVÉ HIER",
269+
"El Niño a Arrivé Hier",
270+
),
267271
)
268272

269273

tox.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ envlist = py26, py27, py33, py34, py35
1212
passenv = TRAVIS TRAVIS_JOB_ID TRAVIS_BRANCH
1313
deps =
1414
nose
15+
regex
1516
coveralls
1617
commands =
1718
coverage run --source=titlecase setup.py nosetests

0 commit comments

Comments
 (0)