-
Notifications
You must be signed in to change notification settings - Fork 35
Expand file tree
/
Copy pathsimple.py
More file actions
91 lines (81 loc) · 3.3 KB
/
simple.py
File metadata and controls
91 lines (81 loc) · 3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""Block segment.Simple"""
from udapi.core.block import Block
from udapi.core.bundle import Bundle
import re
class Simple(Block):
    """Heuristic segmenter, splits on sentence-final punctuation followed by uppercase.

    The exceptions are:
    1) abbreviations of names, e.g. "A. Merkel"
    2) predefined list of nonfinal abbreviations, e.g. "e.g."

    Parameters
    ----------
    keep_spaces : bool
        do not strip whitespaces from the `text` attribute of the sentences created by segmentation
    """

    # Punctuation that may be glued to the start / end of a token and is
    # ignored when deciding whether a sentence boundary is present.
    _OPENING_PUNCT = '"„«¿¡‹('
    _CLOSING_PUNCT = '"“»›)'

    def __init__(self, keep_spaces=False, **kwargs):
        """Store the `keep_spaces` flag; all other keyword arguments go to the parent class."""
        super().__init__(**kwargs)
        self.keep_spaces = keep_spaces

    @staticmethod
    def is_nonfinal_abbrev(token):
        """Is a given token an abbreviation (without the final period) which cannot end a sentence?

        The token is expected *without* its trailing period, e.g. ``"e.g"`` for
        the text ``"e.g."``. Only the end of the token is checked, so leading
        punctuation such as ``"(e.g"`` is accepted as well.
        """
        # The dot must be escaped: an unescaped `.` would match any character
        # and treat ordinary words such as "foreign" as abbreviations.
        return bool(re.search(r'(např|e\.g)$', token))

    def is_boundary(self, first, second):
        """Is there a sentence boundary between the first and second token?

        Parameters
        ----------
        first : str
            the token preceding the potential boundary
        second : str
            the token following the potential boundary

        Returns
        -------
        bool
            True if `first` ends with ``.``, ``!`` or ``?`` (optionally followed by
            one closing quote/bracket), `second` starts with an uppercase letter
            (optionally preceded by one opening quote/bracket) and `first` is
            neither a one-letter name initial nor a known nonfinal abbreviation.
        """
        if not first or not second:
            return False

        # Ignore a single closing quote/bracket after the final punctuation.
        if first[-1] in self._CLOSING_PUNCT:
            first = first[:-1]
            if not first:
                return False

        # Ignore a single opening quote/bracket before the next sentence.
        if second[0] in self._OPENING_PUNCT:
            second = second[1:]
            if not second:
                return False

        # Digits are not uppercase, so a following number never starts a new sentence.
        if not second[0].isupper():
            return False
        if first[-1] not in '.!?':
            return False

        if first[-1] == '.':
            # correctly count length in "„A. Merkel"
            if first[0] in self._OPENING_PUNCT:
                first = first[1:]
            # A single uppercase letter plus period is taken as a name initial.
            if len(first) == 2 and first[0].isupper():
                return False
            if self.is_nonfinal_abbrev(first[:-1]):
                return False
        return True

    def segment_string(self, string):
        """Return a list of sentences in a given string.

        The string is split on single spaces and a new sentence is started
        wherever `is_boundary` reports a boundary between two neighbouring
        non-empty tokens. If `self.keep_spaces` is true, the whitespace
        separating two sentences is kept at the end of the first one, so that
        concatenating the returned segments gives back the original string.
        Otherwise the separating whitespace is dropped.
        """
        tokens = string.split(' ')
        previous = tokens[0]
        segments = [previous]
        for token in tokens[1:]:
            if not token:
                # Consecutive spaces yield empty tokens. Keep the space, but do
                # not let the empty token replace `previous`; otherwise a
                # sentence end followed by two spaces would never be detected.
                segments[-1] += ' '
                continue
            if self.is_boundary(previous, token):
                if self.keep_spaces:
                    segments[-1] += ' '
                else:
                    # Drop extra separating spaces collected from empty tokens.
                    segments[-1] = segments[-1].rstrip(' ')
                segments.append(token)
            else:
                segments[-1] += ' ' + token
            previous = token
        return segments

    def process_document(self, doc):
        """Segment the text of each processed tree, creating one bundle per sentence.

        For every tree selected by `self._should_process_tree` (not defined in
        this class; presumably inherited from `Block`), `tree.text` is split by
        `segment_string`. The original bundle gets the suffix ``-1`` appended to
        its `bundle_id` (also when the text contains a single sentence) and keeps
        the first sentence. Every further sentence is stored in a new bundle
        with the suffix ``-2``, ``-3``, ... holding a tree in the same zone.
        Finally `doc.bundles` is replaced by the list of old and new bundles.

        Raises
        ------
        ValueError
            if a tree to be segmented already has children (is tokenized)
        """
        old_bundles = doc.bundles
        new_bundles = []
        for bundle in old_bundles:
            new_bundles.append(bundle)
            for tree in bundle:
                if self._should_process_tree(tree):
                    if tree.children:
                        raise ValueError("Segmenting already tokenized text is not supported.")
                    sentences = self.segment_string(tree.text)
                    orig_bundle_id = bundle.bundle_id
                    bundle.bundle_id = orig_bundle_id + '-1'
                    if len(sentences) > 1:
                        tree.text = sentences[0]
                        # Numbering of the additional bundles starts at 2.
                        for i, sentence in enumerate(sentences[1:], 2):
                            new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i))
                            new_bundle.create_tree(tree.zone).text = sentence
                            new_bundles.append(new_bundle)
        doc.bundles = new_bundles