Skip to content

Commit 716461f

Browse files
committed
Besides sentence sequences, also count paragraphs with no coref mentions
1 parent 8b44288 commit 716461f

1 file changed

Lines changed: 45 additions & 18 deletions

File tree

udapi/block/corefud/countgaps.py

Lines changed: 45 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from udapi.core.block import Block
2-
from collections import Counter
2+
from collections import defaultdict, Counter
33

44
class CountGaps(Block):
55
"""Block corefud.CountGaps searches for sentence sequences with no coref annotation."""
@@ -9,15 +9,15 @@ def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=T
99
self.report_per_newdoc = report_per_newdoc
1010
self.report_per_file = report_per_file
1111
self.report_total = report_total
12-
self._total_counter = Counter()
12+
self._total_counter = defaultdict(Counter)
1313

14-
def _report_stats(self, counter=None, header_id=None):
15-
if not counter:
16-
counter = self._total_counter
14+
def _report_stats(self, counter, header_id=None):
1715
if header_id:
1816
print(f"============ {header_id} ============")
1917
for key in sorted(counter):
2018
print(f"{key:2d}: {counter[key]}")
19+
print("-------")
20+
print(f"SUM: {sum([k*counter[k] for k in counter])}")
2121

2222
def _count_empty_seqs(self, empty_seqs):
2323
counter = Counter()
@@ -26,42 +26,69 @@ def _count_empty_seqs(self, empty_seqs):
2626
return counter
2727

2828
def process_document(self, doc):
29-
file_counter = Counter()
29+
file_counters = defaultdict(Counter)
3030
empty_seqs = []
31+
empty_pars = []
3132
curr_seq = []
33+
curr_par = []
34+
is_empty_par = True
3235
newdoc = None
3336
for i, tree in enumerate(doc.trees):
3437
if tree.newdoc:
3538
if i:
3639
if curr_seq:
3740
empty_seqs.append(curr_seq)
38-
newdoc_counter = self._count_empty_seqs(empty_seqs)
39-
file_counter.update(newdoc_counter)
41+
newdoc_seq_counter = self._count_empty_seqs(empty_seqs)
42+
file_counters["seq"].update(newdoc_seq_counter)
43+
if is_empty_par:
44+
empty_pars.append(curr_par)
45+
newdoc_par_counter = self._count_empty_seqs(empty_pars)
46+
file_counters["par"].update(newdoc_par_counter)
4047
if self.report_per_newdoc:
41-
self._report_stats(newdoc_counter, header_id=newdoc)
48+
self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc}")
49+
self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc}")
4250
newdoc = tree.newdoc
4351
empty_seqs = []
52+
empty_pars = []
4453
curr_seq = []
54+
curr_par = []
55+
is_empty_par = True
56+
if tree.newpar:
57+
if not tree.newdoc and is_empty_par:
58+
empty_pars.append(curr_par)
59+
curr_par = []
60+
is_empty_par = True
4561

4662
has_mention = any(node.coref_mentions for node in tree.descendants)
4763
if not has_mention:
4864
curr_seq.append(tree.sent_id)
49-
elif curr_seq:
50-
empty_seqs.append(curr_seq)
51-
curr_seq = []
65+
curr_par.append(tree.sent_id)
66+
else:
67+
if curr_seq:
68+
empty_seqs.append(curr_seq)
69+
curr_seq = []
70+
is_empty_par = False
5271

5372
if curr_seq:
5473
empty_seqs.append(curr_seq)
55-
newdoc_counter = self._count_empty_seqs(empty_seqs)
56-
file_counter.update(newdoc_counter)
74+
newdoc_seq_counter = self._count_empty_seqs(empty_seqs)
75+
file_counters["seq"].update(newdoc_seq_counter)
76+
if curr_par:
77+
empty_pars.append(curr_par)
78+
newdoc_par_counter = self._count_empty_seqs(empty_pars)
79+
file_counters["par"].update(newdoc_par_counter)
5780
if self.report_per_newdoc:
58-
self._report_stats(newdoc_counter, header_id=newdoc)
81+
self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc}")
82+
self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc}")
5983

6084
if self.report_per_file:
61-
self._report_stats(file_counter, header_id="FULL DOC")
85+
self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE")
86+
self._report_stats(file_counters["par"], header_id="PAR STATS, FILE")
6287

63-
self._total_counter.update(file_counter)
88+
self._total_counter["seq"].update(file_counters["seq"])
89+
self._total_counter["par"].update(file_counters["par"])
6490

6591
def process_end(self):
6692
if self.report_total:
67-
self._report_stats(header_id="TOTAL")
93+
self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, TOTAL")
94+
self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL")

0 commit comments

Comments
 (0)