-
Notifications
You must be signed in to change notification settings - Fork 35
Expand file tree
/
Copy pathstats.py
More file actions
256 lines (243 loc) · 12.3 KB
/
stats.py
File metadata and controls
256 lines (243 loc) · 12.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
from udapi.core.block import Block
from collections import Counter
class Stats(Block):
    """Block corefud.Stats prints various coreference-related statistics.

    Statistics are accumulated over all processed documents and printed once
    at the end, or (with ``per_doc``) printed and reset after every document.

    Args:
        m_len_max: mention lengths (in non-empty words) of this value or more
            are pooled into the last bucket of the mention-length distribution.
            A false value (0) disables the distribution columns.
        e_len_max: entity lengths (in mentions) of this value or more are
            pooled into the last bucket of the entity-length distribution.
        report_basics: report numbers of newdocs, sentences, words and
            empty nodes.
        report_mentions: report mention counts and the length distribution.
        report_entities: report entity counts and the length distribution.
        report_details: report percentages of mentions with empty nodes,
            with gaps, not forming a treelet, and the distribution of the
            UPOS of mention heads.
        selected_upos: space-separated UPOS tags which get their own column;
            heads with any other UPOS are counted as ``other``. The value
            ``'all'`` reports every UPOS that was seen.
        exclude_singletons: skip entities with exactly one mention.
        exclude_nonsingletons: skip entities with more than one mention.
        style: one of ``human``, ``tex``, ``tex-table``, ``tex-doc``.
        per_doc: print one row (or one listing) per document.
        max_rows_per_page: with ``per_doc``, close the current table and open
            a new one after more than this many rows.
        docname: ``newdoc`` or ``filename``; which document label to print
            with ``per_doc``.
        docname_len: width of the document-label column.
        **kwargs: passed on to ``Block.__init__``.

    Raises:
        ValueError: if ``style`` or ``docname`` has an unsupported value.
    """

    _STYLES = ('tex', 'tex-table', 'tex-doc', 'human')
    _DOCNAMES = ('newdoc', 'filename')

    def __init__(self, m_len_max=5, e_len_max=5,
                 report_basics=False, report_mentions=True, report_entities=True,
                 report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _',
                 exclude_singletons=False, exclude_nonsingletons=False, style='human',
                 per_doc=False, max_rows_per_page=50, docname='newdoc', docname_len=15,
                 **kwargs):
        super().__init__(**kwargs)
        self.m_len_max = m_len_max
        self.e_len_max = e_len_max
        self.report_basics = report_basics
        self.report_mentions = report_mentions
        self.report_entities = report_entities
        self.report_details = report_details
        self.exclude_singletons = exclude_singletons
        self.exclude_nonsingletons = exclude_nonsingletons
        self.style = style
        if style not in self._STYLES:
            raise ValueError(f'Unknown style {style}')
        self.per_doc = per_doc
        self.max_rows_per_page = max_rows_per_page
        if docname not in self._DOCNAMES:
            # The message must name the parameter that is actually wrong.
            raise ValueError(f'Unknown docname {docname}')
        self.docname = docname
        self.docname_len = docname_len
        # None = nothing printed yet (the tex-doc preamble is still pending),
        # 0 = a table page has just been closed, >0 = rows in the open table.
        self._header_printed = False
        self._lines_printed = None
        self._reset_stats()
        self.selected_upos = None if selected_upos == 'all' else selected_upos.split()

    def _reset_stats(self):
        """Set all accumulated statistics back to zero."""
        self.counter = Counter()
        self.mentions = 0
        self.entities = 0
        self.singletons = 0
        self.total_nodes = 0
        self.longest_mention = 0
        self.longest_entity = 0
        self.m_words = 0

    def _upos_list(self):
        """Return the head-UPOS values which get their own output column."""
        if self.selected_upos:
            return self.selected_upos + ['other']
        # 'all' mode: report every UPOS seen so far; 12 == len('m_head_upos_').
        return [x[12:] for x in self.counter if x.startswith('m_head_upos_')]

    def process_document(self, doc):
        """Accumulate entity, mention and (optionally) basic counts from `doc`."""
        self.total_nodes += sum(1 for _ in doc.nodes)
        self.counter['documents'] += 1
        for entity in doc.coref_entities:
            len_mentions = len(entity.mentions)
            # Singletons are counted even when they are excluded below.
            if len_mentions == 1:
                self.singletons += 1
            if len_mentions == 1 and self.exclude_singletons:
                continue
            if len_mentions > 1 and self.exclude_nonsingletons:
                continue
            self.longest_entity = max(len_mentions, self.longest_entity)
            self.counter['c_total_len'] += len_mentions
            self.counter[f"c_len_{min(len_mentions, self.e_len_max)}"] += 1
            self.entities += 1
            if not self.report_mentions and not self.report_details:
                continue
            for mention in entity.mentions:
                self._count_mention(mention)

        if self.report_basics:
            for tree in doc.trees:
                if tree.newdoc:
                    self.counter['newdocs'] += 1
                self.counter['sents'] += 1
                self.counter['words'] += len(tree.descendants)
                self.counter['empty'] += len(tree.empty_nodes)

    def _count_mention(self, mention):
        """Update the mention-level counters with a single mention."""
        self.mentions += 1
        all_words = len(mention.words)
        non_empty = sum(1 for w in mention.words if not w.is_empty())
        self.m_words += all_words
        # Mention length is measured in non-empty words only.
        self.longest_mention = max(non_empty, self.longest_mention)
        self.counter['m_total_len'] += non_empty
        self.counter[f"m_len_{min(non_empty, self.m_len_max)}"] += 1
        if not self.report_details:
            return
        upos = 'other'
        if not self.selected_upos or mention.head.upos in self.selected_upos:
            upos = mention.head.upos
        self.counter['m_head_upos_' + upos] += 1
        if all_words > non_empty:
            self.counter['m_with_empty'] += 1
        if ',' in mention.span:
            self.counter['m_with_gaps'] += 1
        # Count words whose parent lies outside the mention; more than one
        # such word means the mention is not a single treelet.
        heads, mwords = 0, set(mention.words)
        for w in mention.words:
            if w.parent:
                heads += 0 if w.parent in mwords else 1
            else:
                # No basic parent: fall back to the parents listed in w.deps.
                heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1
        if heads > 1:
            self.counter['m_nontreelet'] += 1

    def after_process_document(self, doc):
        """With `per_doc`, print the statistics of `doc` and start afresh."""
        if self.per_doc:
            self.process_end(skip=False, doc=doc)
            self._reset_stats()

    def process_end(self, skip=True, doc=None):
        """Print the accumulated statistics.

        Args:
            skip: with `per_doc`, only close the output (rows were already
                printed from `after_process_document`) and return.
            doc: the document whose label is printed with `per_doc`.
        """
        if not self._lines_printed:
            self.print_header()
            self._lines_printed = 0
        is_tex = self.style.startswith('tex')
        if self.per_doc:
            if skip:
                self.print_footer()
                return
            if self.docname == 'filename':
                docname = doc.meta['loaded_from']
            else:
                docname = doc[0].trees[0].newdoc
            # A width-only format spec raises TypeError for None and
            # right-aligns bools as integers, so format a plain string.
            docname = '' if docname is None else str(docname)
            print(f"{docname:{self.docname_len}}", end='&' if is_tex else '\n')
        elif self.style.startswith('tex-'):
            print(f"{self.counter['documents']:4} documents &")
        self._lines_printed += 1

        # Denominators guarded against division by zero.
        mentions_nonzero = 1 if self.mentions == 0 else self.mentions
        entities_nonzero = 1 if self.entities == 0 else self.entities
        total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes

        columns = []
        if self.report_basics:
            columns += [('docs', f"{self.counter['newdocs']:7,}"),
                        ('sents', f"{self.counter['sents']:7,}"),
                        ('words', f"{self.counter['words']:7,}"),
                        ('empty', f"{self.counter['empty']:7,}")]
        if self.report_entities:
            columns += [('entities', f"{self.entities:7,}"),
                        ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"),
                        ('longest_entity', f"{self.longest_entity:6}"),
                        ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")]
            for i in range(1, self.e_len_max + 1):
                percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero
                columns.append((f"c_len_{i}{'' if i < self.e_len_max else '+'}", f"{percent:5.1f}"))
        if self.report_mentions:
            columns += [('mentions', f"{self.mentions:7,}"),
                        ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"),
                        ('longest_mention', f"{self.longest_mention:6}"),
                        ('avg_mention', f"{self.counter['m_total_len'] / mentions_nonzero:5.1f}")]
            if self.m_len_max:
                # Starts at 0: a mention may consist of empty nodes only.
                for i in range(0, self.m_len_max + 1):
                    percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero
                    columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}"))
        if self.report_details:
            columns += [('with_empty', f"{100 * self.counter['m_with_empty'] / mentions_nonzero:5.1f}"),
                        ('with_gaps', f"{100 * self.counter['m_with_gaps'] / mentions_nonzero:5.1f}"),
                        ('nontreelet', f"{100 * self.counter['m_nontreelet'] / mentions_nonzero:5.1f}")]
            for upos in self._upos_list():
                percent = 100 * self.counter['m_head_upos_' + upos] / mentions_nonzero
                columns.append(('head_upos=' + upos, f"{percent:5.1f}"))

        if is_tex:
            print(" & ".join(value for _, value in columns), end=" \\\\\n")
        elif self.style == 'human':
            for name, value in columns:
                print(f"{name:>15} = {value.strip():>10}")

        if not self.per_doc:
            self.print_footer()
        elif self._lines_printed > self.max_rows_per_page:
            # Page is full: close the table but keep the LaTeX document open.
            self.print_footer(False)
            self._lines_printed = 0

    def print_header(self):
        """Print the LaTeX table header (only for the `tex-*` styles)."""
        if not self.style.startswith('tex-'):
            return
        # The document preamble is printed only once, before the first table.
        if self.style == 'tex-doc' and self._lines_printed is None:
            print(r'\documentclass[multi=mypage]{standalone}')
            print(r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}')
            print(r'\title{Udapi coreference statistics}')
            print(r'\begin{document}')
            print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}')

        # lines[0] is the tabular column specification,
        # lines[1..3] are the three header rows.
        lines = [r'\begin{mypage}\begin{tabular}{@{}l ',
                 " " * self.docname_len,
                 ("document" if self.per_doc else "dataset ") + " " * (self.docname_len-8),
                 " " * self.docname_len]
        upos_list = self._upos_list() if self.report_details else []

        if self.report_basics:
            lines[0] += "rrrr "
            lines[1] += r'& \MC{4}{total number of} '
            lines[2] += r'& & & & '
            lines[3] += r'& docs & sents & words & empty n.'
        if self.report_entities:
            lines[0] += "rrrr "
            lines[1] += r'& \MC{4}{entities} '
            lines[2] += r'& total & per 1k & \MC{2}{length} '
            lines[3] += r'& count & words & max & avg. '
            if self.e_len_max:
                for i in range(1, self.e_len_max + 1):
                    lines[0] += "r"
                    lines[2] += f"& {i:4}" + ("+ " if i==self.e_len_max else " ")
                    lines[3] += r'& [\%] '
                lines[0] += " "
                lines[1] += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}'
        if self.report_mentions:
            lines[0] += "rrrr "
            lines[1] += r'& \MC{4}{mentions} '
            lines[2] += r'& total & per 1k & \MC{2}{length} '
            lines[3] += r'& count & words & max & avg. '
            if self.m_len_max:
                for i in range(0, self.m_len_max + 1):
                    lines[0] += "r"
                    lines[2] += f"& {i:4}" + ("+ " if i==self.m_len_max else " ")
                    lines[3] += r'& [\%] '
                lines[0] += " "
                lines[1] += r'& \MC{' + str(self.m_len_max + 1) + r'}{distribution of mention lengths}' + " "*7
        if self.report_details:
            # Exactly three "mention type" columns follow
            # (w/empty, w/gap, non-tree), so declare three, not four.
            lines[0] += "rrr "
            lines[1] += r'& \MC{3}{mention type} '
            lines[2] += r'&w/empty& w/gap&non-tree'
            lines[3] += r'& [\%] ' * 3
            lines[0] += "@{~}r" * len(upos_list)
            lines[1] += r"& \MC{" + str(len(upos_list)) + r"}{distribution of head UPOS}"
            lines[2] += ''.join(f'&{upos:7}' for upos in upos_list)
            lines[3] += r'& [\%] ' * len(upos_list)
        lines[0] += r'@{}}\toprule'

        # Partial rules under the group headings; last_col tracks the index
        # of the last table column consumed so far (column 1 is the label).
        last_col = 1
        lines[1] += r'\\'
        lines[2] += r'\\'
        lines[3] += r'\\\midrule'
        if self.report_basics:
            last_col += 4
            lines[1] += r'\cmidrule(lr){2-5}'
        if self.report_entities:
            lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}'
            lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}'
            last_col += 4
            if self.e_len_max:
                # The start column depends on whether the basics group is
                # present, so it must not be hard-coded.
                lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+self.e_len_max}" + '}'
                last_col += self.e_len_max
        if self.report_mentions:
            lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}'
            lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}'
            last_col += 4
            if self.m_len_max:
                lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+self.m_len_max+1}" + '}'
                last_col += self.m_len_max + 1
        if self.report_details:
            lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+3}"
            lines[1] += r'}\cmidrule(l){' + f"{last_col+4}-{last_col+3+len(upos_list)}" + '}'
        print("\n".join(lines))

    def print_footer(self, end_doc=True):
        """Close the LaTeX table and, if `end_doc`, also the `tex-doc` document."""
        if not self.style.startswith('tex-'):
            return
        print(r'\bottomrule\end{tabular}\end{mypage}')
        if self.style == 'tex-doc' and end_doc:
            print(r'\end{document}')