-
Notifications
You must be signed in to change notification settings - Fork 35
Expand file tree
/
Copy pathmiscstatstex.py
More file actions
44 lines (32 loc) · 1.63 KB
/
miscstatstex.py
File metadata and controls
44 lines (32 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from udapi.core.block import Block
from collections import Counter
import re
class MiscStatsTex(Block):
    """Print the most frequent values of each MISC attribute as LaTeX items.

    For every attribute name found in ``node.misc`` the block counts how often
    each value occurs.  Bracketed numeric indices (e.g. ``[2]``) are removed
    from the attribute name first, so indexed variants of one attribute are
    counted together.  After all nodes have been processed, one LaTeX list
    item per attribute is printed to standard output, listing the attribute's
    total number of occurrences and the relative frequency (in percent) of its
    ``maxvalues`` most frequent values.
    """

    # Matches a bracketed numeric index inside an attribute name, e.g. "[2]".
    _INDEX_RE = re.compile(r'\[\d+\]')

    # Characters that are special in LaTeX text mode and their escaped forms.
    # str.translate applies the table in a single pass, so the braces and
    # backslashes introduced by one replacement are never escaped again.
    _TEX_ESCAPES = str.maketrans({
        '\\': r'\textbackslash{}',
        '_': r'\_',
        '%': r'\%',
        '&': r'\&',
        '#': r'\#',
        '$': r'\$',
        '{': r'\{',
        '}': r'\}',
        '~': r'\textasciitilde{}',
        '^': r'\textasciicircum{}',
    })

    def __init__(self, maxvalues=10, **kwargs):
        """Create the MiscStatsTex block.

        Args:
            maxvalues: the number of most frequent values to be printed for
                each attribute.  An int or a numeric string is accepted
                (presumably parameters given on the command line arrive as
                strings -- verify against the udapi runner).  ``None`` means
                that all values are printed.
            **kwargs: passed unchanged to the ``Block`` constructor.

        Raises:
            ValueError: if ``maxvalues`` is a string that is not an integer.
        """
        super().__init__(**kwargs)
        # Counter.most_common() compares its argument with an int, so a
        # numeric string must be converted up front.
        self.maxvalues = None if maxvalues is None else int(maxvalues)
        # attribute name (index removed) -> Counter of its values
        self.valuecounter = {}
        # attribute name (index removed) -> total number of occurrences
        self.totalcounter = Counter()

    @classmethod
    def _tex_escape(cls, text):
        """Return ``text`` with LaTeX special characters escaped."""
        return str(text).translate(cls._TEX_ESCAPES)

    def process_node(self, node):
        """Count the value of every MISC attribute of ``node``."""
        for attrname in node.misc:
            shortattrname = self._INDEX_RE.sub('', attrname)
            if shortattrname not in self.valuecounter:
                self.valuecounter[shortattrname] = Counter()
            self.valuecounter[shortattrname][node.misc[attrname]] += 1
            self.totalcounter[shortattrname] += 1

    def process_end(self):
        """Print one LaTeX list item per attribute with its value distribution."""
        for attrname, values in self.valuecounter.items():
            # total >= 1 here: a name is registered only when it is counted.
            total = self.totalcounter[attrname]
            distrvalues = [
                f'\\attr{{{self._tex_escape(value)}}} {100*freq/total:2.1f}~\\%'
                for value, freq in values.most_common(self.maxvalues)
            ]
            texname = self._tex_escape(attrname)
            print(f" \\item attribute \\attr{{{texname}}}, {total:,} occurrences, values: "
                  + ", ".join(distrvalues))