Skip to content

Commit 020206b

Browse files
committed
add Chapter7
1 parent fabcb32 commit 020206b

1 file changed

Lines changed: 55 additions & 0 deletions

File tree

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env python3
2+
#!encoding=utf-8
3+
4+
from urllib.request import urlopen
5+
from bs4 import BeautifulSoup
6+
import re
7+
import string
8+
from collections import OrderedDict
9+
10+
11+
def cleanInput(input):
    """Normalize scraped text into a list of cleaned words.

    Collapses newlines and repeated spaces, removes bracketed citation
    markers such as "[12]", drops any non-ASCII characters, strips
    leading/trailing punctuation from each word, and discards
    single-character tokens other than "a"/"i".

    :param input: raw text string scraped from a web page
    :return: list of cleaned word strings
    """
    # Raw strings for the regexes: '\[' in a plain string is an invalid
    # escape sequence (SyntaxWarning on modern Python).
    text = re.sub(r'\n+', " ", input)       # newlines -> spaces
    text = re.sub(r'\[[0-9]*\]', "", text)  # drop citation markers, e.g. "[3]"
    text = re.sub(r' +', " ", text)         # collapse runs of spaces
    # Round-trip through bytes to silently strip non-ASCII characters.
    text = bytes(text, "UTF-8").decode("ascii", "ignore")

    # Renamed from the original's `cleanInput`, which shadowed this
    # function's own name.
    cleaned = []
    for word in text.split(' '):
        word = word.strip(string.punctuation)
        # Keep multi-character words; single characters only if "a" or "i".
        if len(word) > 1 or word.lower() in ('a', 'i'):
            cleaned.append(word)
    return cleaned
29+
30+
31+
def getNgrams(input, n):
    """Build a frequency table of word n-grams from raw text.

    The text is first normalized via cleanInput(); each n-gram is the
    space-joined run of n consecutive cleaned words.

    :param input: raw text string
    :param n: number of consecutive words per n-gram
    :return: dict mapping each n-gram string to its occurrence count
    """
    # Bind the cleaned word list to its own name instead of rebinding
    # the `input` parameter to a different type, as the original did.
    words = cleanInput(input)
    output = dict()
    for i in range(len(words) - n + 1):
        gram = " ".join(words[i:i + n])
        # dict.get replaces the original's explicit membership if/else.
        output[gram] = output.get(gram, 0) + 1
    return output
41+
42+
43+
44+
# Fetch the Wikipedia article on Python and extract the article body text
# from the main content <div>.
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(html,"html.parser")
content = bsObj.find("div", {"id":"mw-content-text"}).get_text()


# Count all word 2-grams (bigrams) in the article text.
ngrams = getNgrams(content, 2)

# Sort the bigram counts in descending order of frequency, preserving the
# sorted order in an OrderedDict.  (The original comment said "dedupe",
# but this line sorts — dict keys are already unique.)
ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True))


print(ngrams)

0 commit comments

Comments
 (0)