11from udapi .core .block import Block
2- from collections import Counter
2+ from collections import defaultdict , Counter
33
44class CountGaps (Block ):
55 """Block corefud.checkConsistency searches for sentence sequences with no coref annotation."""
@@ -9,15 +9,15 @@ def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=T
99 self .report_per_newdoc = report_per_newdoc
1010 self .report_per_file = report_per_file
1111 self .report_total = report_total
12- self ._total_counter = Counter ( )
12+ self ._total_counter = defaultdict ( Counter )
1313
def _report_stats(self, counter, header_id=None):
    """Print one statistics table to standard output.

    Args:
        counter: mapping from an integer key to a count. Keys are printed
            in ascending order with the ``2d`` format, so they have to be
            integers.
        header_id: optional title; when truthy, a banner line containing
            it is printed before the table.

    After the table a separator line and a ``SUM`` line are printed; the
    sum is the total of ``key * count`` over all entries.
    """
    if header_id:
        print(f"============ {header_id} ============")
    weighted_total = 0
    for length, count in sorted(counter.items()):
        print(f"{length:2d} : {count} ")
        weighted_total += length * count
    print("-------")
    print(f"SUM: {weighted_total} ")
2121
2222 def _count_empty_seqs (self , empty_seqs ):
2323 counter = Counter ()
@@ -26,42 +26,69 @@ def _count_empty_seqs(self, empty_seqs):
2626 return counter
2727
def process_document(self, doc):
    """Collect statistics about stretches of ``doc`` without mentions.

    A tree counts as annotated when at least one of its descendants has a
    non-empty ``coref_mentions``. Two kinds of gaps are gathered for every
    section that starts at a tree with ``newdoc`` set:

    * "seq" -- maximal runs of consecutive unannotated trees (a run is
      not interrupted by a paragraph boundary);
    * "par" -- paragraphs, delimited by ``tree.newpar``, in which no tree
      is annotated.

    Each list of gaps is summarised by ``self._count_empty_seqs``. The
    summaries are added to the per-file counters and, at the end, to
    ``self._total_counter``; they are printed via ``self._report_stats``
    when ``self.report_per_newdoc`` / ``self.report_per_file`` are set.

    Args:
        doc: object whose ``trees`` yield items with ``newdoc``,
            ``newpar``, ``sent_id`` and ``descendants``.
    """
    file_counters = defaultdict(Counter)
    empty_seqs = []
    empty_pars = []
    curr_seq = []
    curr_par = []
    is_empty_par = True
    newdoc = None
    for i, tree in enumerate(doc.trees):
        if tree.newdoc:
            if i:
                # Close the section that ends right before this tree.
                if curr_seq:
                    empty_seqs.append(curr_seq)
                newdoc_seq_counter = self._count_empty_seqs(empty_seqs)
                file_counters["seq"].update(newdoc_seq_counter)
                # A paragraph is a gap only if it is non-empty and none of
                # its trees was annotated.
                if is_empty_par and curr_par:
                    empty_pars.append(curr_par)
                newdoc_par_counter = self._count_empty_seqs(empty_pars)
                file_counters["par"].update(newdoc_par_counter)
                if self.report_per_newdoc:
                    self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc} ")
                    self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc} ")
            newdoc = tree.newdoc
            empty_seqs = []
            empty_pars = []
            curr_seq = []
            curr_par = []
            is_empty_par = True
        if tree.newpar:
            # ``curr_par`` is empty both right after a newdoc reset and on
            # the very first tree, so nothing is recorded in those cases
            # (previously a first tree with newpar but no newdoc produced
            # a zero-length paragraph entry).
            if is_empty_par and curr_par:
                empty_pars.append(curr_par)
            curr_par = []
            is_empty_par = True

        has_mention = any(node.coref_mentions for node in tree.descendants)
        if not has_mention:
            curr_seq.append(tree.sent_id)
            curr_par.append(tree.sent_id)
        else:
            if curr_seq:
                empty_seqs.append(curr_seq)
            curr_seq = []
            is_empty_par = False

    # Close the last section of the file.
    if curr_seq:
        empty_seqs.append(curr_seq)
    newdoc_seq_counter = self._count_empty_seqs(empty_seqs)
    file_counters["seq"].update(newdoc_seq_counter)
    # Testing ``curr_par`` alone is not enough here: a last paragraph that
    # mixes annotated and unannotated trees has a non-empty ``curr_par``
    # but must not be reported as an unannotated paragraph.
    if is_empty_par and curr_par:
        empty_pars.append(curr_par)
    newdoc_par_counter = self._count_empty_seqs(empty_pars)
    file_counters["par"].update(newdoc_par_counter)
    if self.report_per_newdoc:
        self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc} ")
        self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc} ")

    if self.report_per_file:
        self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE")
        self._report_stats(file_counters["par"], header_id="PAR STATS, FILE")

    self._total_counter["seq"].update(file_counters["seq"])
    self._total_counter["par"].update(file_counters["par"])
6490
def process_end(self):
    """Print the totals accumulated over all processed documents.

    Nothing happens unless ``self.report_total`` is true. Otherwise the
    "seq" and then the "par" entry of ``self._total_counter`` are handed
    to ``self._report_stats`` with their respective titles.
    """
    if not self.report_total:
        return
    titled_kinds = (
        ("seq", "SEQ STATS, TOTAL"),
        ("par", "PAR STATS, TOTAL"),
    )
    for kind, title in titled_kinds:
        self._report_stats(self._total_counter[kind], header_id=title)
0 commit comments