Skip to content

Commit 8b44288

Browse files
committed
CorefUD: counting sentence sequences with no coref annotation
1 parent 16c3a48 commit 8b44288

1 file changed

Lines changed: 67 additions & 0 deletions

File tree

udapi/block/corefud/countgaps.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from udapi.core.block import Block
2+
from collections import Counter
3+
4+
class CountGaps(Block):
    """Block corefud.CountGaps counts sentence sequences with no coref annotation.

    A "gap" is a maximal run of consecutive trees (sentences) in which no node
    has any coreference mention. For every gap length the block counts how
    many gaps of that length occur, and prints the resulting histogram as
    ``<length>: <number of gaps>`` lines.

    Args:
        report_per_newdoc: print a histogram for each section started by a
            tree whose ``newdoc`` attribute is set.
        report_per_file: print a histogram for each processed document
            (under the header "FULL DOC").
        report_total: print one histogram aggregated over all processed
            documents when ``process_end`` is called.
    """

    def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs):
        super().__init__(**kwargs)
        self.report_per_newdoc = report_per_newdoc
        self.report_per_file = report_per_file
        self.report_total = report_total
        # Gap-length histogram accumulated over all processed documents.
        self._total_counter = Counter()

    def _report_stats(self, counter=None, header_id=None):
        """Print the histogram `counter`, preceded by an optional header line.

        Args:
            counter: mapping gap length -> number of gaps; when omitted
                (``None``), the overall total counter is printed.
            header_id: label for the header line; no header is printed when
                it is falsy.
        """
        # Compare with None explicitly: an empty Counter is falsy, and a
        # section without any gap must not be replaced by the running total.
        if counter is None:
            counter = self._total_counter
        if header_id:
            print(f"============ {header_id} ============")
        for key in sorted(counter):
            print(f"{key:2d}: {counter[key]}")

    def _count_empty_seqs(self, empty_seqs):
        """Return a Counter mapping sequence length -> number of such sequences."""
        return Counter(len(seq) for seq in empty_seqs)

    def _finish_newdoc(self, empty_seqs, curr_seq, file_counter, newdoc):
        """Close the current newdoc section: count its gaps and report them.

        A still-open gap `curr_seq` is appended to `empty_seqs` first, then the
        section histogram is added to `file_counter` and optionally printed
        under the header `newdoc`.
        """
        if curr_seq:
            empty_seqs.append(curr_seq)
        newdoc_counter = self._count_empty_seqs(empty_seqs)
        file_counter.update(newdoc_counter)
        if self.report_per_newdoc:
            self._report_stats(newdoc_counter, header_id=newdoc)

    def process_document(self, doc):
        """Collect gap statistics for `doc` and add them to the running total.

        Gaps never span a tree whose ``newdoc`` attribute is set: such a tree
        closes the statistics of the preceding section and opens a new one.
        """
        file_counter = Counter()
        empty_seqs = []   # finished gaps of the current newdoc section
        curr_seq = []     # sent_ids of the gap currently being extended
        newdoc = None     # label of the current section (value of tree.newdoc)
        for i, tree in enumerate(doc.trees):
            if tree.newdoc:
                # Nothing precedes the very first tree, so there is no
                # earlier section to close in that case.
                if i:
                    self._finish_newdoc(empty_seqs, curr_seq, file_counter, newdoc)
                newdoc = tree.newdoc
                empty_seqs = []
                curr_seq = []

            has_mention = any(node.coref_mentions for node in tree.descendants)
            if not has_mention:
                curr_seq.append(tree.sent_id)
            elif curr_seq:
                # A sentence with a mention terminates the open gap.
                empty_seqs.append(curr_seq)
                curr_seq = []

        # Close the last (or the only) section of the document.
        self._finish_newdoc(empty_seqs, curr_seq, file_counter, newdoc)

        if self.report_per_file:
            self._report_stats(file_counter, header_id="FULL DOC")

        self._total_counter.update(file_counter)

    def process_end(self):
        """Print the histogram aggregated over all documents, if requested."""
        if self.report_total:
            self._report_stats(header_id="TOTAL")

0 commit comments

Comments
 (0)