|
3 | 3 | from collections import Counter |
4 | 4 | import argparse |
5 | 5 | import re |
6 | | -from itertools import islice |
7 | | -import operator |
8 | | - |
9 | | -url_re = re.compile( |
10 | | - 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' |
11 | | -) |
12 | | - |
| 6 | +from toolz.itertoolz import sliding_window |
13 | 7 |
|
14 | 8 | parser = argparse.ArgumentParser() |
15 | 9 | parser.add_argument('--numWords',type=int,default=10) |
|
18 | 12 | parser.add_argument('file',type=str) |
19 | 13 | args = parser.parse_args() |
20 | 14 |
|
21 | | -# Inspired by http://stackoverflow.com/questions/6822725 |
22 | | -def window(seq, n): |
23 | | - it = iter(seq) |
24 | | - result = tuple(islice(it, n)) |
25 | | - if len(result) == n: |
26 | | - yield result |
27 | | - for elem in it: |
28 | | - result = result[1:] + (elem,) |
29 | | - useThisWindow = True |
30 | | - for i in result: |
31 | | - if len(i) < args.minWordLength: |
32 | | - useThisWindow = False |
33 | | - break |
34 | | - elif url_re.search(i): |
35 | | - useThisWindow = False |
36 | | - break |
37 | | - if useThisWindow: |
38 | | - yield result |
# Matches an http(s) URL anywhere inside a word. Compiled once at import time
# rather than on every call; the raw string keeps the pattern value unchanged
# while avoiding invalid-escape warnings for "\(" and "\)".
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def filter_func(tup):
    """Return True only if *every* word in ``tup`` is acceptable.

    A word is rejected when it is shorter than ``args.minWordLength`` or when
    it contains an http(s) URL.  The whole tuple is rejected as soon as one
    word is rejected.

    The previous version returned True as soon as the first word passed, so
    later words in a multi-word window were never checked.

    Args:
        tup: An iterable of words (strings), e.g. one sliding window.

    Returns:
        bool: True if all words pass both checks, False otherwise.
    """
    min_length = args.minWordLength
    # Length is tested before the regex, so short words never reach the
    # more expensive URL search.
    return all(
        len(word) >= min_length and not _URL_RE.search(word)
        for word in tup
    )
39 | 23 |
|
def filtered_window(seq,n):
    """Return a lazy iterator over the sliding windows of ``seq`` kept by ``filter_func``.

    Windows are produced by ``sliding_window(n, seq)`` (imported from
    ``toolz.itertoolz``) and only those for which ``filter_func`` returns a
    truthy value are passed through.
    """
    candidate_windows = sliding_window(n, seq)
    return filter(filter_func, candidate_windows)
| 26 | + |
# Load the whole input file, flatten newlines to spaces and lower-case it so
# that counting is case-insensitive.
with open(args.file,'r') as f:
    content = f.read().replace('\n',' ').lower()

# A "word" is any maximal run of non-whitespace characters.
words = re.findall(r'\S+', content)

# For every window size from 1 up to args.maxTuples, print the
# args.numWords most frequent windows that survive filtering.
for window_size in range(1, args.maxTuples + 1):
    print("\n=== Sliding Window: {} ===".format(window_size))
    ranked = Counter(filtered_window(words, window_size)).most_common(args.numWords)
    for ngram, count in ranked:
        print(" {}: '{}'".format(count, " ".join(ngram)))
0 commit comments