|
| 1 | +#! /usr/bin/env python |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +# vim:fenc=utf-8 |
| 4 | +# Copyright By PyLyria |
| 5 | +# CreateTime: 2016-03-03 20:51:40 |
| 6 | +import os |
| 7 | +import re |
| 8 | +import math |
| 9 | +import heapq |
| 10 | +from string import punctuation |
| 11 | +from operator import itemgetter |
| 12 | + |
def remove_punctuation(text):
    """Return *text* without ASCII punctuation, trimmed and lower-cased.

    Every character listed in ``string.punctuation`` is deleted, then
    leading/trailing whitespace is stripped and the result is lower-cased.
    """
    # str.translate deletes each punctuation character literally.  Pasting
    # the raw punctuation string into a regex character class lets its
    # backslash escape the following ']', so backslashes would survive.
    return text.translate(str.maketrans('', '', punctuation)).strip().lower()
| 16 | + |
def split(file_name):
    """Lazily yield the tokens of each line of a UTF-8 text file.

    Each line is stripped of surrounding whitespace and then cut on a
    semicolon, comma or whitespace character followed by any run of
    whitespace.  One list of tokens is yielded per line; tokens may be
    empty strings (for example, a blank line yields ``['']``).
    """
    delimiter = re.compile(r'[;,\s]\s*')
    with open(file_name, 'rt', encoding='utf-8') as handle:
        for raw_line in handle:
            yield delimiter.split(raw_line.strip())
| 22 | + |
def get_path(root = os.curdir):
    """Yield ``(directory, file_name)`` for every file found under *root*.

    The tree is traversed with ``os.walk`` starting from *root* with a
    path separator appended, so the top-level directory is reported with
    that trailing separator.
    """
    top = root + os.sep
    for directory, _subdirs, file_names in os.walk(top):
        yield from ((directory, name) for name in file_names)
| 28 | + |
def get_tf(file_name):
    """Compute term frequencies for the words of one text file.

    The file is tokenised with ``split`` and every token is normalised
    with ``remove_punctuation``.

    Returns a dict mapping each word to a ``(count, frequency)`` tuple,
    where ``frequency`` is ``count`` divided by the total number of counted
    words.  A file with no words yields an empty dict.
    """
    word2count = {}
    for line in split(file_name):
        for token in line:
            word = remove_punctuation(token)
            # Blank lines, adjacent delimiters and punctuation-only tokens
            # all normalise to ''.  That is not a word; counting it would
            # inflate the total and add a bogus '' entry to the result.
            if not word:
                continue
            word2count[word] = word2count.get(word, 0) + 1
    total = sum(word2count.values())
    # When nothing was counted the comprehension has no items, so the
    # division by ``total`` is never evaluated.
    return {word: (count, count / total) for word, count in word2count.items()}
| 38 | + |
def get_IDF(total_TF):
    """Compute the inverse document frequency of every keyword.

    *total_TF* maps a document identifier to that document's keyword
    mapping.  For each keyword the result holds
    ``log2(number_of_documents / documents_containing_keyword)``.
    """
    document_count = len(total_TF)
    containing = {}
    for keywords in total_TF.values():
        for keyword in keywords:
            containing[keyword] = containing.get(keyword, 0) + 1
    return {
        keyword: math.log(document_count / hits, 2)
        for keyword, hits in containing.items()
    }
| 46 | + |
def get_weight(TF, IDF):
    """Return the TF-IDF weight of every keyword in *TF*.

    Each weight is the second element of the keyword's *TF* entry
    multiplied by the keyword's value in *IDF*.  A keyword missing from
    *IDF* raises ``KeyError``.
    """
    weight = {}
    for key, entry in TF.items():
        weight[key] = entry[1] * IDF[key]
    return weight
| 50 | + |
if __name__ == '__main__':
    # Script entry point: compute TF-IDF weights for every .txt file under
    # the current directory and print the five heaviest words of each file.
    suffix = '.txt'  # named ``suffix`` so the builtin ``format`` is not shadowed
    total_TF = {}
    word_weight = {}

    for path, file_name in get_path():
        if file_name.endswith(suffix):
            # Key by the joined path rather than the bare file name so that
            # same-named files in different directories do not overwrite
            # each other's term frequencies.
            full_path = os.path.join(path, file_name)
            total_TF[full_path] = get_tf(full_path)

    IDF = get_IDF(total_TF)

    for full_path, TF in total_TF.items():
        word_weight[full_path] = get_weight(TF, IDF)
        print(heapq.nlargest(5, word_weight[full_path].items(), key=itemgetter(1)))
0 commit comments