Skip to content

Commit 660f6e0

Browse files
committed
corefud.Stats report_basics=1
1 parent f747fa0 commit 660f6e0

1 file changed

Lines changed: 37 additions & 7 deletions

File tree

udapi/block/corefud/stats.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,16 @@
44
class Stats(Block):
55
"""Block corefud.Stats prints various coreference-related statistics."""
66

7-
def __init__(self, m_len_max=5, e_len_max=5, report_mentions=True, report_entities=True,
8-
report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM',
7+
def __init__(self, m_len_max=5, e_len_max=5,
8+
report_basics=False, report_mentions=True, report_entities=True,
9+
report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _',
910
exclude_singletons=False, exclude_nonsingletons=False, style='human',
10-
per_doc=False, max_rows_per_page=50, **kwargs):
11+
per_doc=False, max_rows_per_page=50, docname='newdoc', docname_len=15,
12+
**kwargs):
1113
super().__init__(**kwargs)
1214
self.m_len_max = m_len_max
1315
self.e_len_max = e_len_max
16+
self.report_basics = report_basics
1417
self.report_mentions = report_mentions
1518
self.report_entities = report_entities
1619
self.report_details = report_details
@@ -21,6 +24,10 @@ def __init__(self, m_len_max=5, e_len_max=5, report_mentions=True, report_entiti
2124
raise ValueError(f'Unknown style {style}')
2225
self.per_doc = per_doc
2326
self.max_rows_per_page = max_rows_per_page
27+
if docname not in 'newdoc filename'.split():
28+
raise ValueError(f'Unknown docname {docname}')
29+
self.docname = docname
30+
self.docname_len = docname_len
2431
self._header_printed = False
2532
self._lines_printed = None
2633

@@ -75,6 +82,12 @@ def process_document(self, doc):
7582
heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1
7683
self.counter['m_nontreelet'] += 1 if heads > 1 else 0
7784

85+
if self.report_basics:
86+
for tree in doc.trees:
87+
self.counter['newdocs'] += 1 if tree.newdoc else 0
88+
self.counter['sents'] += 1
89+
self.counter['words'] += len(tree.descendants)
90+
self.counter['empty'] += len(tree.empty_nodes)
7891

7992
def after_process_document(self, doc):
8093
if self.per_doc:
@@ -97,7 +110,8 @@ def process_end(self, skip=True, doc=None):
97110
self.print_footer()
98111
return
99112
else:
100-
print(f"{doc[0].trees[0].newdoc:15}", end='&' if self.style.startswith('tex') else '\n')
113+
docname = doc.meta['loaded_from'] if self.docname == 'filename' else doc[0].trees[0].newdoc
114+
print(f"{docname:{self.docname_len}}", end='&' if self.style.startswith('tex') else '\n')
101115
elif self.style.startswith('tex-'):
102116
print(f"{self.counter['documents']:4} documents &")
103117
self._lines_printed += 1
@@ -107,6 +121,11 @@ def process_end(self, skip=True, doc=None):
107121
total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes
108122

109123
columns =[ ]
124+
if self.report_basics:
125+
columns += [('docs', f"{self.counter['newdocs']:7,}"),
126+
('sents', f"{self.counter['sents']:7,}"),
127+
('words', f"{self.counter['words']:7,}"),
128+
('empty', f"{self.counter['empty']:7,}"),]
110129
if self.report_entities:
111130
columns += [('entities', f"{self.entities:7,}"),
112131
('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"),
@@ -156,7 +175,15 @@ def print_header(self):
156175
print(r'\title{Udapi coreference statistics}')
157176
print(r'\begin{document}')
158177
print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}')
159-
lines = [r'\begin{mypage}\begin{tabular}{@{}l ', " "*15, ("document" if self.per_doc else "dataset ") + " "*7, " "*15]
178+
lines = [r'\begin{mypage}\begin{tabular}{@{}l ',
179+
" " * self.docname_len,
180+
("document" if self.per_doc else "dataset ") + " " * (self.docname_len-8),
181+
" " * self.docname_len]
182+
if self.report_basics:
183+
lines[0] += "rrrr "
184+
lines[1] += r'& \MC{4}{total number of} '
185+
lines[2] += r'& & & & '
186+
lines[3] += r'& docs & sents & words & empty n.'
160187
if self.report_entities:
161188
lines[0] += "rrrr "
162189
lines[1] += r'& \MC{4}{entities} '
@@ -199,10 +226,13 @@ def print_header(self):
199226
lines[1] += r'\\'
200227
lines[2] += r'\\'
201228
lines[3] += r'\\\midrule'
202-
if self.report_entities:
229+
if self.report_basics:
203230
last_col += 4
204231
lines[1] += r'\cmidrule(lr){2-5}'
205-
lines[2] += r'\cmidrule(lr){4-5}'
232+
if self.report_entities:
233+
lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}'
234+
lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}'
235+
last_col += 4
206236
if self.e_len_max:
207237
last_col += self.e_len_max
208238
lines[1] += r'\cmidrule(lr){' + f"{last_col - self.e_len_max + 1}-{last_col}" + '}'

0 commit comments

Comments
 (0)