44class Stats (Block ):
55 """Block corefud.Stats prints various coreference-related statistics."""
66
7- def __init__ (self , m_len_max = 5 , c_len_max = 5 , report_mentions = True , report_entities = True ,
7+ def __init__ (self , m_len_max = 5 , e_len_max = 5 , report_mentions = True , report_entities = True ,
88 report_details = True , selected_upos = 'NOUN PRON PROPN DET ADJ VERB ADV NUM' ,
9- exclude_singletons = False , exclude_nonsingletons = False , style = 'human' , ** kwargs ):
9+ exclude_singletons = False , exclude_nonsingletons = False , style = 'human' ,
10+ per_doc = False , ** kwargs ):
1011 super ().__init__ (** kwargs )
1112 self .m_len_max = m_len_max
12- self .c_len_max = c_len_max
13+ self .e_len_max = e_len_max
1314 self .report_mentions = report_mentions
1415 self .report_entities = report_entities
1516 self .report_details = report_details
1617 self .exclude_singletons = exclude_singletons
1718 self .exclude_nonsingletons = exclude_nonsingletons
1819 self .style = style
19- if style not in 'tex human' .split ():
20- raise ValueError (f'Unknown style f{ style } ' )
20+ if style not in 'tex tex-table tex-doc human' .split ():
21+ raise ValueError (f'Unknown style { style } ' )
22+ self .per_doc = per_doc
23+ self ._header_printed = False
2124
2225 self .counter = Counter ()
2326 self .mentions = 0
@@ -31,6 +34,7 @@ def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_entiti
3134
3235 def process_document (self , doc ):
3336 self .total_nodes += len (list (doc .nodes ))
37+ self .counter ['documents' ] += 1
3438 for entity in doc .coref_entities :
3539 len_mentions = len (entity .mentions )
3640 if len_mentions == 1 :
@@ -41,7 +45,7 @@ def process_document(self, doc):
4145 continue
4246 self .longest_entity = max (len_mentions , self .longest_entity )
4347 self .counter ['c_total_len' ] += len_mentions
44- self .counter [f"c_len_{ min (len_mentions , self .c_len_max )} " ] += 1
48+ self .counter [f"c_len_{ min (len_mentions , self .e_len_max )} " ] += 1
4549
4650 self .entities += 1
4751 if not self .report_mentions and not self .report_details :
@@ -69,7 +73,32 @@ def process_document(self, doc):
6973 heads += 0 if any (d ['parent' ] in mwords for d in w .deps ) else 1
7074 self .counter ['m_nontreelet' ] += 1 if heads > 1 else 0
7175
72- def process_end (self ):
76+
def after_process_document(self, doc):
    """Emit per-document statistics (if enabled) and reset all accumulators.

    When ``self.per_doc`` is true, the statistics gathered so far are
    reported through ``self.process_end(skip=False, doc=doc)``.
    Regardless of ``per_doc``, the counter and every scalar accumulator
    are then re-initialised.

    NOTE(review): presumably invoked by the framework once after each
    ``process_document`` call -- confirm against the base class.

    Args:
        doc: the document object forwarded unchanged to ``process_end``.
    """
    if self.per_doc:
        self.process_end(skip=False, doc=doc)

    self.counter = Counter()
    # Scalar accumulators, zeroed in the same order as they were listed originally.
    scalar_accumulators = (
        'mentions',
        'entities',
        'singletons',
        'total_nodes',
        'longest_mention',
        'longest_entity',
        'm_words',
    )
    for attr_name in scalar_accumulators:
        setattr(self, attr_name, 0)
88+
89+ def process_end (self , skip = True , doc = None ):
90+ if not self ._header_printed :
91+ self ._header_printed = True
92+ self .print_header ()
93+ if self .per_doc :
94+ if skip :
95+ self .print_footer ()
96+ return
97+ else :
98+ print (f"{ doc [0 ].trees [0 ].newdoc :15} " , end = '&' if self .style .startswith ('tex' ) else '\n ' )
99+ elif self .style .startswith ('tex-' ):
100+ print (f"{ self .counter ['documents' ]:4} documents &" )
101+
73102 mentions_nonzero = 1 if self .mentions == 0 else self .mentions
74103 entities_nonzero = 1 if self .entities == 0 else self .entities
75104 total_nodes_nonzero = 1 if self .total_nodes == 0 else self .total_nodes
@@ -80,17 +109,18 @@ def process_end(self):
80109 ('entities_per1k' , f"{ 1000 * self .entities / total_nodes_nonzero :6.0f} " ),
81110 ('longest_entity' , f"{ self .longest_entity :6} " ),
82111 ('avg_entity' , f"{ self .counter ['c_total_len' ] / entities_nonzero :5.1f} " )]
83- for i in range (1 , self .c_len_max + 1 ):
112+ for i in range (1 , self .e_len_max + 1 ):
84113 percent = 100 * self .counter [f"c_len_{ i } " ] / entities_nonzero
85- columns .append ((f"c_len_{ i } { '' if i < self .c_len_max else '+' } " , f"{ percent :5.1f} " ))
114+ columns .append ((f"c_len_{ i } { '' if i < self .e_len_max else '+' } " , f"{ percent :5.1f} " ))
86115 if self .report_mentions :
87116 columns += [('mentions' , f"{ self .mentions :7,} " ),
88117 ('mentions_per1k' , f"{ 1000 * self .mentions / total_nodes_nonzero :6.0f} " ),
89118 ('longest_mention' , f"{ self .longest_mention :6} " ),
90119 ('avg_mention' , f"{ self .counter ['m_total_len' ] / mentions_nonzero :5.1f} " )]
91- for i in range (0 , self .m_len_max + 1 ):
92- percent = 100 * self .counter [f"m_len_{ i } " ] / mentions_nonzero
93- columns .append ((f"m_len_{ i } { '' if i < self .m_len_max else '+' } " , f"{ percent :5.1f} " ))
120+ if self .m_len_max :
121+ for i in range (0 , self .m_len_max + 1 ):
122+ percent = 100 * self .counter [f"m_len_{ i } " ] / mentions_nonzero
123+ columns .append ((f"m_len_{ i } { '' if i < self .m_len_max else '+' } " , f"{ percent :5.1f} " ))
94124 if self .report_details :
95125 columns += [('with_empty' , f"{ 100 * self .counter ['m_with_empty' ] / mentions_nonzero :5.1f} " ),
96126 ('with_gaps' , f"{ 100 * self .counter ['m_with_gaps' ] / mentions_nonzero :5.1f} " ),
@@ -102,8 +132,88 @@ def process_end(self):
102132 for upos in upos_list :
103133 columns .append (('head_upos=' + upos , f"{ 100 * self .counter ['m_head_upos_' + upos ] / mentions_nonzero :5.1f} " ))
104134
105- if self .style == 'tex' :
106- print (" & " .join (c [1 ] for c in columns ))
135+ if self .style . startswith ( 'tex' ) :
136+ print (" & " .join (c [1 ] for c in columns ), end = " \\ \\ \n " )
107137 elif self .style == 'human' :
108138 for c in columns :
109139 print (f"{ c [0 ]:>15} = { c [1 ].strip ():>10} " )
140+ if not self .per_doc :
141+ self .print_footer ()
142+
def print_header(self):
    """Print the LaTeX table header for the ``tex-table`` and ``tex-doc`` styles.

    Nothing is printed unless ``self.style`` starts with ``'tex-'``.
    For ``'tex-doc'`` a standalone-document preamble is printed first.
    The header consists of four lines: the ``tabular`` column
    specification, a row of group titles, a row of sub-titles and a
    row of units, followed by the matching booktabs rules.  Which
    column groups appear is governed by ``report_entities``,
    ``report_mentions``, ``report_details``, ``e_len_max`` and
    ``m_len_max``.

    The mention-type group has three columns (w/empty, w/gap, non-tree),
    so it contributes ``rrr`` to the column specification; declaring a
    fourth column there would shift the ``@{~}`` separators meant for
    the head-UPOS columns and leave an empty column at the right edge.
    """
    if not self.style.startswith('tex-'):
        return
    if self.style == 'tex-doc':
        print(r'\documentclass{standalone}')
        print(r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}')
        print(r'\title{Udapi coreference statistics}')
        print(r'\begin{document}')
        print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}')

    # lines[0]: column spec, lines[1]: group titles,
    # lines[2]: sub-titles,  lines[3]: units.
    lines = [r'\begin{tabular}{@{}l ',
             " " * 15,
             ("document" if self.per_doc else "dataset ") + " " * 7,
             " " * 15]

    if self.report_entities:
        lines[0] += "rrrr "
        lines[1] += r'& \MC{4}{entities} '
        lines[2] += r'& total & per 1k & \MC{2}{length} '
        lines[3] += r'& count & words & max & avg. '
        if self.e_len_max:
            # One column per entity length 1..e_len_max; the last one is open-ended.
            for i in range(1, self.e_len_max + 1):
                lines[0] += "r"
                lines[2] += f"& {i:4}" + ("+ " if i == self.e_len_max else " ")
                lines[3] += r'& [\%] '
            lines[0] += " "
            lines[1] += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}'

    if self.report_mentions:
        lines[0] += "rrrr "
        lines[1] += r'& \MC{4}{mentions} '
        lines[2] += r'& total & per 1k & \MC{2}{length} '
        lines[3] += r'& count & words & max & avg. '
        if self.m_len_max:
            # Mention lengths start at 0, hence m_len_max + 1 columns.
            for i in range(0, self.m_len_max + 1):
                lines[0] += "r"
                lines[2] += f"& {i:4}" + ("+ " if i == self.m_len_max else " ")
                lines[3] += r'& [\%] '
            lines[0] += " "
            lines[1] += r'& \MC{' + str(self.m_len_max + 1) + r'}{distribution of mention lengths}' + " " * 7

    if self.report_details:
        # Exactly three mention-type columns follow, so three 'r' specs.
        lines[0] += "rrr "
        lines[1] += r'& \MC{3}{mention type} '
        lines[2] += r'&w/empty& w/gap&non-tree'
        lines[3] += r'& [\%] ' * 3
        if self.selected_upos:
            # assumes selected_upos is a list of UPOS tags -- TODO confirm in __init__
            upos_list = self.selected_upos + ['other']
        else:
            prefix = 'm_head_upos_'
            upos_list = [key[len(prefix):] for key in self.counter if key.startswith(prefix)]
        lines[0] += "@{~}r" * len(upos_list)
        lines[1] += r"& \MC{" + str(len(upos_list)) + r"}{distribution of head UPOS}"
        lines[2] += ''.join(f'&{upos:7}' for upos in upos_list)
        lines[3] += r'& [\%] ' * len(upos_list)

    lines[0] += r'@{}}\toprule'
    lines[1] += r'\\'
    lines[2] += r'\\'
    lines[3] += r'\\\midrule'

    # Partial rules under the group titles; last_col tracks the index of the
    # right-most column emitted so far (column 1 is the row label).
    last_col = 1
    if self.report_entities:
        last_col += 4
        lines[1] += r'\cmidrule(lr){2-5}'
        lines[2] += r'\cmidrule(lr){4-5}'
        if self.e_len_max:
            last_col += self.e_len_max
            lines[1] += r'\cmidrule(lr){6-' + str(last_col) + '}'
    if self.report_mentions:
        lines[1] += r'\cmidrule(lr){' + f"{last_col + 1}-{last_col + 4}" + '}'
        lines[2] += r'\cmidrule(lr){' + f"{last_col + 3}-{last_col + 4}" + '}'
        last_col += 4
        if self.m_len_max:
            lines[1] += r'\cmidrule(lr){' + f"{last_col + 1}-{last_col + self.m_len_max + 1}" + '}'
            last_col += self.m_len_max + 1
    if self.report_details:
        lines[1] += r'\cmidrule(lr){' + f"{last_col + 1}-{last_col + 3}"
        lines[1] += r'}\cmidrule(l){' + f"{last_col + 4}-{last_col + 3 + len(upos_list)}" + '}'
    print("\n".join(lines))
213+
def print_footer(self):
    """Close the LaTeX table opened for the ``tex-table`` / ``tex-doc`` styles.

    Prints the bottom rule and ``\\end{tabular}`` when ``self.style``
    starts with ``'tex-'``; for ``'tex-doc'`` the document environment
    is closed as well.  Any other style prints nothing.
    """
    style = self.style
    if style.startswith('tex-'):
        print(r'\bottomrule\end{tabular}')
        if style == 'tex-doc':
            print(r'\end{document}')
0 commit comments