44class Stats (Block ):
55 """Block corefud.Stats prints various coreference-related statistics."""
66
7- def __init__ (self , m_len_max = 5 , e_len_max = 5 , report_mentions = True , report_entities = True ,
8- report_details = True , selected_upos = 'NOUN PRON PROPN DET ADJ VERB ADV NUM' ,
7+ def __init__ (self , m_len_max = 5 , e_len_max = 5 ,
8+ report_basics = False , report_mentions = True , report_entities = True ,
9+ report_details = True , selected_upos = 'NOUN PRON PROPN DET ADJ VERB ADV NUM _' ,
910 exclude_singletons = False , exclude_nonsingletons = False , style = 'human' ,
10- per_doc = False , max_rows_per_page = 50 , ** kwargs ):
11+ per_doc = False , max_rows_per_page = 50 , docname = 'newdoc' , docname_len = 15 ,
12+ ** kwargs ):
1113 super ().__init__ (** kwargs )
1214 self .m_len_max = m_len_max
1315 self .e_len_max = e_len_max
16+ self .report_basics = report_basics
1417 self .report_mentions = report_mentions
1518 self .report_entities = report_entities
1619 self .report_details = report_details
@@ -21,6 +24,10 @@ def __init__(self, m_len_max=5, e_len_max=5, report_mentions=True, report_entiti
2124 raise ValueError (f'Unknown style { style } ' )
2225 self .per_doc = per_doc
2326 self .max_rows_per_page = max_rows_per_page
27+ if docname not in 'newdoc filename' .split ():
28+ raise ValueError (f'Unknown docname { docname } ' )
29+ self .docname = docname
30+ self .docname_len = docname_len
2431 self ._header_printed = False
2532 self ._lines_printed = None
2633
@@ -75,6 +82,12 @@ def process_document(self, doc):
7582 heads += 0 if any (d ['parent' ] in mwords for d in w .deps ) else 1
7683 self .counter ['m_nontreelet' ] += 1 if heads > 1 else 0
7784
85+ if self .report_basics :
86+ for tree in doc .trees :
87+ self .counter ['newdocs' ] += 1 if tree .newdoc else 0
88+ self .counter ['sents' ] += 1
89+ self .counter ['words' ] += len (tree .descendants )
90+ self .counter ['empty' ] += len (tree .empty_nodes )
7891
7992 def after_process_document (self , doc ):
8093 if self .per_doc :
@@ -97,7 +110,8 @@ def process_end(self, skip=True, doc=None):
97110 self .print_footer ()
98111 return
99112 else :
100- print (f"{ doc [0 ].trees [0 ].newdoc :15} " , end = '&' if self .style .startswith ('tex' ) else '\n ' )
113+ docname = doc .meta ['loaded_from' ] if self .docname == 'filename' else doc [0 ].trees [0 ].newdoc
114+ print (f"{ docname :{self .docname_len }} " , end = '&' if self .style .startswith ('tex' ) else '\n ' )
101115 elif self .style .startswith ('tex-' ):
102116 print (f"{ self .counter ['documents' ]:4} documents &" )
103117 self ._lines_printed += 1
@@ -107,6 +121,11 @@ def process_end(self, skip=True, doc=None):
107121 total_nodes_nonzero = 1 if self .total_nodes == 0 else self .total_nodes
108122
109123 columns = [ ]
124+ if self .report_basics :
125+ columns += [('docs' , f"{ self .counter ['newdocs' ]:7,} " ),
126+ ('sents' , f"{ self .counter ['sents' ]:7,} " ),
127+ ('words' , f"{ self .counter ['words' ]:7,} " ),
128+ ('empty' , f"{ self .counter ['empty' ]:7,} " ),]
110129 if self .report_entities :
111130 columns += [('entities' , f"{ self .entities :7,} " ),
112131 ('entities_per1k' , f"{ 1000 * self .entities / total_nodes_nonzero :6.0f} " ),
@@ -156,7 +175,15 @@ def print_header(self):
156175 print (r'\title{Udapi coreference statistics}' )
157176 print (r'\begin{document}' )
158177 print (r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}' )
159- lines = [r'\begin{mypage}\begin{tabular}{@{}l ' , " " * 15 , ("document" if self .per_doc else "dataset " ) + " " * 7 , " " * 15 ]
178+ lines = [r'\begin{mypage}\begin{tabular}{@{}l ' ,
179+ " " * self .docname_len ,
180+ ("document" if self .per_doc else "dataset " ) + " " * (self .docname_len - 8 ),
181+ " " * self .docname_len ]
182+ if self .report_basics :
183+ lines [0 ] += "rrrr "
184+ lines [1 ] += r'& \MC{4}{total number of} '
185+ lines [2 ] += r'& & & & '
186+ lines [3 ] += r'& docs & sents & words & empty n.'
160187 if self .report_entities :
161188 lines [0 ] += "rrrr "
162189 lines [1 ] += r'& \MC{4}{entities} '
@@ -199,10 +226,13 @@ def print_header(self):
199226 lines [1 ] += r'\\'
200227 lines [2 ] += r'\\'
201228 lines [3 ] += r'\\\midrule'
202- if self .report_entities :
229+ if self .report_basics :
203230 last_col += 4
204231 lines [1 ] += r'\cmidrule(lr){2-5}'
205- lines [2 ] += r'\cmidrule(lr){4-5}'
232+ if self .report_entities :
233+ lines [1 ] += r'\cmidrule(lr){' + f"{ last_col + 1 } -{ last_col + 4 } " + '}'
234+ lines [2 ] += r'\cmidrule(lr){' + f"{ last_col + 3 } -{ last_col + 4 } " + '}'
235+ last_col += 4
206236 if self .e_len_max :
207237 last_col += self .e_len_max
208238 lines [1 ] += r'\cmidrule(lr){' + str (last_col - self .e_len_max + 1 ) + '-' + str (last_col ) + '}'
0 commit comments