1+ #!/usr/bin/env python3
2+ #!encoding=utf-8
3+
4+ from urllib .request import urlopen
5+ from bs4 import BeautifulSoup
6+ import re
7+ import string
8+ from collections import OrderedDict
9+
10+
def cleanInput(input):
    """Normalize raw text into a list of ASCII word tokens.

    Bracketed numeric citation markers such as ``[12]`` are removed, every
    run of whitespace (newlines, tabs, spaces) is collapsed into one space,
    non-ASCII characters are discarded, and each space-separated token has
    its leading/trailing punctuation stripped.

    Args:
        input: The text to clean. The name shadows the builtin but is kept
            so that existing keyword callers continue to work.

    Returns:
        A list of tokens in their original order. Tokens of length 0 or 1
        are dropped, except the words "a" and "i" (in any case).
    """
    # Remove citation markers like "[3]" before normalizing whitespace.
    text = re.sub(r'\[[0-9]*\]', "", input)
    # Collapse *all* whitespace. Matching only "newline followed by spaces"
    # left words separated by a bare newline or tab glued into one token.
    text = re.sub(r'\s+', " ", text)
    # Round-trip through bytes to drop every non-ASCII character.
    text = bytes(text, "UTF-8").decode("ascii", "ignore")

    stripped = (word.strip(string.punctuation) for word in text.split(' '))
    # Keep real words only: anything longer than one character, plus the
    # one-letter English words "a" and "I".
    return [
        word for word in stripped
        if len(word) > 1 or word.lower() in ('a', 'i')
    ]
29+
30+
def getNgrams(input, n):
    """Count the n-grams of consecutive cleaned words in a text.

    The text is tokenized with ``cleanInput``; every window of ``n``
    adjacent tokens is joined with single spaces to form an n-gram key.

    Args:
        input: Raw text to analyse.
        n: Number of tokens per n-gram.

    Returns:
        A dict mapping each n-gram string to its number of occurrences.
    """
    words = cleanInput(input)
    counts = {}
    window_total = len(words) - n + 1
    for start in range(window_total):
        gram = " ".join(words[start:start + n])
        # Missing keys start from zero, so first and repeat sightings
        # share one code path.
        counts[gram] = counts.get(gram, 0) + 1
    return counts
41+
42+
43+
# Fetch the article. The context manager closes the HTTP response as soon as
# the page has been parsed instead of leaving it open for the process lifetime.
with urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)") as html:
    bsObj = BeautifulSoup(html, "html.parser")
content = bsObj.find("div", {"id": "mw-content-text"}).get_text()

# Count the 2-grams found in the article body text.
ngrams = getNgrams(content, 2)

# Sort the n-grams from most to least frequent; OrderedDict keeps that order.
ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True))

print(ngrams)
0 commit comments