Skip to content

Commit 10b1dda

Browse files
authored
Create 0006.py
1 parent 008d218 commit 10b1dda

1 file changed

Lines changed: 65 additions & 0 deletions

File tree

pylyria/0006/0006.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#! /usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
# vim:fenc=utf-8
4+
# Copyright By PyLyria
5+
# CreateTime: 2016-03-03 20:51:40
6+
import os
7+
import re
8+
import math
9+
import heapq
10+
from string import punctuation
11+
from operator import itemgetter
12+
13+
def remove_punctuation(text):
    """Return *text* without ASCII punctuation, stripped and lower-cased.

    Every character listed in ``string.punctuation`` is deleted, then the
    result is stripped of leading/trailing whitespace and lower-cased.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; it is empty when *text* holds nothing but
        punctuation and whitespace.
    """
    # str.translate deletes each punctuation character literally.  A regex
    # character class built by interpolating ``punctuation`` unescaped reads
    # the ``\]`` pair as an escaped bracket, so backslashes were never
    # removed by the earlier implementation.
    without_punctuation = text.translate(str.maketrans('', '', punctuation))
    return without_punctuation.strip().lower()
16+
17+
def split(file_name):
    """Yield one list of tokens for every line of the file *file_name*.

    The file is opened as UTF-8 text.  Each line is stripped of surrounding
    whitespace and then split wherever a semicolon, comma or whitespace
    character occurs (together with any whitespace that follows it).  A
    blank line therefore yields ``['']``.
    """
    delimiter = re.compile(r'[;,\s]\s*')
    with open(file_name, 'rt', encoding='utf-8') as handle:
        for raw_line in handle:
            yield delimiter.split(raw_line.strip())
22+
23+
def get_path(root = os.curdir):
    """Yield a ``(directory, file name)`` pair for every file below *root*.

    A path separator is appended to *root* before the tree is walked with
    ``os.walk``, so the directory reported for files directly inside *root*
    ends with that separator.
    """
    top = root + os.sep
    for directory, _subdirs, names in os.walk(top):
        for name in names:
            yield directory, name
28+
29+
def get_tf(file_name):
    """Compute the term frequencies of the words in the file *file_name*.

    The file is tokenized line by line with ``split`` and every token is
    normalized with ``remove_punctuation``.

    Args:
        file_name: Path of the text file to analyse.

    Returns:
        A dict mapping each word to ``(count, count / total)``, where
        ``total`` is the number of counted words in the file.  The dict is
        empty when the file contains no words.
    """
    word2count = {}
    for line in split(file_name):
        for token in line:
            word = remove_punctuation(token)
            # Blank lines and punctuation-only tokens normalize to '', which
            # is not a word; counting it would inflate ``total`` and distort
            # the frequency of every real word.
            if not word:
                continue
            word2count[word] = word2count.get(word, 0) + 1
    total = sum(word2count.values())
    # With no words the comprehension body never runs, so there is no
    # division by zero for an empty file.
    return {key: (value, value / total) for key, value in word2count.items()}
38+
39+
def get_IDF(total_TF):
    """Return the inverse document frequency of every keyword in *total_TF*.

    *total_TF* maps a document name to a mapping whose keys are that
    document's keywords.  For each keyword the result holds the base-2
    logarithm of (number of documents / number of documents containing the
    keyword).
    """
    document_count = len(total_TF)

    # First pass: in how many documents does each keyword appear?
    document_frequency = {}
    for keywords in total_TF.values():
        for keyword in keywords:
            if keyword in document_frequency:
                document_frequency[keyword] += 1
            else:
                document_frequency[keyword] = 1

    # Second pass: turn the document counts into log-scaled IDF values.
    return {
        keyword: math.log(document_count / frequency, 2)
        for keyword, frequency in document_frequency.items()
    }
46+
47+
def get_weight(TF, IDF):
    """Return the TF-IDF weight of every keyword in *TF*.

    For each key of *TF* the element at index 1 of its entry is multiplied
    by the value stored under the same key in *IDF*.  A key missing from
    *IDF* raises ``KeyError``.
    """
    weight = {}
    for key, entry in TF.items():
        weight[key] = entry[1] * IDF[key]
    return weight
50+
51+
if __name__ == '__main__':
    # Script entry point: compute TF-IDF weights for every ``.txt`` file
    # below the current directory and print the five highest-weighted
    # keywords of each file.
    suffix = '.txt'  # named ``suffix`` so the builtin ``format`` is not shadowed
    total_TF = {}
    word_weight = {}

    for path, file_name in get_path():
        if file_name.endswith(suffix):
            # Key by the full path rather than the bare file name, so two
            # files with the same name in different directories do not
            # overwrite each other.  os.path.join also avoids the doubled
            # separator that manual concatenation produced.
            full_path = os.path.join(path, file_name)
            total_TF[full_path] = get_tf(full_path)

    IDF = get_IDF(total_TF)

    for full_path, TF in total_TF.items():
        word_weight[full_path] = get_weight(TF, IDF)
        # Each printed item is a (keyword, weight) pair, largest weight first.
        print(heapq.nlargest(5, word_weight[full_path].items(), key=itemgetter(1)))

0 commit comments

Comments
 (0)