|
| 1 | +from udapi.core.block import Block |
| 2 | +from collections import Counter |
| 3 | + |
class CountGaps(Block):
    """Count gaps: runs of consecutive sentences without any coreference mention.

    A gap is a maximal sequence of adjacent trees in which no node
    (as returned by ``tree.descendants``) has a non-empty ``coref_mentions``.
    Gaps never span a ``newdoc`` boundary. The block prints a histogram
    mapping gap length (number of sentences) to the number of such gaps.

    Args:
        report_per_newdoc: print a histogram for every newdoc section.
        report_per_file: print a histogram for every processed document
            (under the header "FULL DOC").
        report_total: print one histogram accumulated over all processed
            documents at the end of the run (under the header "TOTAL").
    """

    def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs):
        super().__init__(**kwargs)
        self.report_per_newdoc = report_per_newdoc
        self.report_per_file = report_per_file
        self.report_total = report_total
        # Histogram accumulated over every document seen by this block instance.
        self._total_counter = Counter()

    def _report_stats(self, counter=None, header_id=None):
        """Print the histogram ``counter`` (gap length -> count), sorted by length.

        Args:
            counter: histogram to print; when omitted (``None``), the
                accumulated total histogram is printed instead.
            header_id: if truthy, a header line with this value is printed first.
        """
        # Compare against None explicitly: an empty Counter is falsy, and a
        # section without any gap must print nothing rather than the totals.
        if counter is None:
            counter = self._total_counter
        if header_id:
            print(f"============ {header_id} ============")
        for key in sorted(counter):
            print(f"{key:2d}: {counter[key]}")

    def _count_empty_seqs(self, empty_seqs):
        """Return a Counter mapping sequence length to the number of sequences."""
        return Counter(len(seq) for seq in empty_seqs)

    def _close_newdoc(self, empty_seqs, curr_seq, newdoc, file_counter):
        """Finish one newdoc section: count its gaps, add them to the file, report."""
        # A gap still open at the end of the section counts as well.
        if curr_seq:
            empty_seqs.append(curr_seq)
        newdoc_counter = self._count_empty_seqs(empty_seqs)
        file_counter.update(newdoc_counter)
        if self.report_per_newdoc:
            self._report_stats(newdoc_counter, header_id=newdoc)

    def process_document(self, doc):
        """Collect gap statistics for ``doc`` and print the requested reports."""
        file_counter = Counter()
        empty_seqs = []  # finished gaps of the current newdoc (lists of sent_ids)
        curr_seq = []    # sent_ids of the gap currently being extended
        newdoc = None    # value of tree.newdoc that opened the current section
        for i, tree in enumerate(doc.trees):
            if tree.newdoc:
                # Nothing has been collected before the very first tree,
                # so there is no previous section to close at i == 0.
                if i:
                    self._close_newdoc(empty_seqs, curr_seq, newdoc, file_counter)
                newdoc = tree.newdoc
                empty_seqs = []
                curr_seq = []

            # NOTE(review): only tree.descendants are inspected; confirm whether
            # mentions anchored solely on empty nodes should also break a gap.
            if any(node.coref_mentions for node in tree.descendants):
                # A sentence with a mention terminates the open gap, if any.
                if curr_seq:
                    empty_seqs.append(curr_seq)
                    curr_seq = []
            else:
                curr_seq.append(tree.sent_id)

        # Close the last (or the only) section of the document.
        self._close_newdoc(empty_seqs, curr_seq, newdoc, file_counter)

        if self.report_per_file:
            self._report_stats(file_counter, header_id="FULL DOC")

        self._total_counter.update(file_counter)

    def process_end(self):
        """Print the histogram accumulated over all documents, if requested."""
        if self.report_total:
            self._report_stats(header_id="TOTAL")
0 commit comments