Skip to content

Commit 1b57959

Browse files
committed
add File
1 parent 2fd47e4 commit 1b57959

11 files changed

Lines changed: 744 additions & 0 deletions

File tree

51BLOG/getexcel3.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#!/bin/env python
2+
# -*- coding:utf-8 -*-
3+
# @Author : kaliarch
4+
5+
import xlsxwriter
6+
7+
class create_excle:
    """Build the .xlsx report listing blog names and their URLs.

    Sheet layout: row 0 holds the merged title, row 1 the column tags,
    and the search results start at row 2.
    """

    def __init__(self):
        # Column headers written on the row directly below the title.
        self.tag_list = ["blog_name", "blog_url"]

    def create_workbook(self, search=" "):
        """Create ``<search>.xlsx`` with one worksheet named after ``search``.

        Returns:
            The ``(workbook, worksheet)`` pair.
        """
        excle_name = search + '.xlsx'
        workbook = xlsxwriter.Workbook(excle_name)
        worksheet_M = workbook.add_worksheet(search)
        print('create %s....' % excle_name)
        return workbook, worksheet_M

    def col_row(self, worksheet):
        """Set the title-row height and the width of columns A and B."""
        worksheet.set_row(0, 17)
        # Both columns share one width, so a single range call is enough.
        worksheet.set_column('A:B', 58)

    def shell_format(self, workbook):
        """Register the cell formats used by the report.

        Returns:
            ``(merge_format, name_format, normal_format)`` for the title,
            the column tags and the body cells respectively.
        """
        # Title and tag cells differ only in their fill colour.
        header_base = {
            'bold': 1,
            'border': 1,
            'align': 'center',
            'valign': 'vcenter',
        }
        merge_format = workbook.add_format(dict(header_base, fg_color='#FAEBD7'))
        name_format = workbook.add_format(dict(header_base, fg_color='#E0FFFF'))
        normal_format = workbook.add_format({'align': 'center'})
        return merge_format, name_format, normal_format

    def write_title(self, worksheet, search, merge_format):
        """Write the merged title cell (A1:B1) for the given search keyword."""
        title = search + "搜索结果"
        worksheet.merge_range('A1:B1', title, merge_format)
        print('write title success')

    def write_tag(self, worksheet, name_format):
        """Write the column tags from ``self.tag_list`` on row 1."""
        for tag_col, tag in enumerate(self.tag_list):
            worksheet.write(1, tag_col, tag, name_format)
        print('write tag success')

    def write_context(self, worksheet, con_dic, normal_format):
        """Write every ``name -> url`` pair of ``con_dic``, one per row.

        Rows start at 2 because rows 0 and 1 hold the title and the tags.
        Every entry is written; the loop is bounded by the dictionary
        itself rather than by comparing the row index with its length,
        which would stop one entry early.
        """
        for row, (name, url) in enumerate(con_dic.items(), start=2):
            worksheet.write(row, 0, name, normal_format)
            worksheet.write(row, 1, url, normal_format)
        print('write context success')

    def workbook_close(self, workbook):
        """Close the workbook."""
        workbook.close()
78+
79+
# Running this file directly only prints a notice; it is meant to be imported.
if __name__ == "__main__":
    print('This is create excel mode')

51BLOG/geturl3.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/bin/env python
2+
# -*- coding:utf-8 -*-
3+
4+
import requests
5+
from bs4 import BeautifulSoup
6+
7+
class get_urldic:
    """Query the 51CTO blog search and collect result titles with their links."""

    # The keyword and the page number are appended to these two URL parts.
    FIRST_URL = 'http://blog.51cto.com/search/result?q='
    AFTER_URL = '&type=&page='
    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    def get_url(self, search=None, page=None):
        """Build one search-result URL per requested page.

        Args:
            search: Keyword to search for; read from stdin when ``None``.
            page: Number of result pages; read from stdin when ``None``.

        Returns:
            ``(urlList, search)``: the URLs for pages 1..page and the
            keyword exactly as it was entered.

        Raises:
            SystemExit: if input cannot be read or the page count is not
                an integer.
        """
        from urllib.parse import quote

        try:
            if search is None:
                search = input("Please input search name:")
            if page is None:
                page = input("Please input page:")
            page = int(page)
            # Escape the keyword so characters such as '&', '#' or spaces
            # cannot break the query string.
            keyword = quote(search, safe='')
        except (ValueError, EOFError, TypeError) as e:
            print('Input error:', e)
            raise SystemExit(1)

        urlList = [
            self.FIRST_URL + keyword + self.AFTER_URL + str(num)
            for num in range(1, page + 1)
        ]
        print("Please wait....")
        return urlList, search

    def get_html(self, urlList):
        """Download every URL and return the raw response bodies (bytes)."""
        # A timeout keeps one stalled connection from hanging the whole run.
        return [
            requests.get(url, timeout=self.REQUEST_TIMEOUT).content
            for url in urlList
        ]

    def get_soup(self, html_doc):
        """Extract ``{blog title: blog url}`` from the downloaded pages.

        Titles are stripped of surrounding whitespace; a title that occurs
        more than once keeps the link seen last. Anchors without an
        ``href`` attribute are skipped.
        """
        result = {}
        for page_html in html_doc:
            soup = BeautifulSoup(page_html, 'html.parser')
            for link in soup.find_all('a', class_='m-1-4 fl'):
                href = link.get('href')
                if href is None:
                    continue
                result[link.get_text().strip()] = href
        return result
44+
45+
46+
47+
# Manual check: run one interactive search and print every hit.
if __name__ == "__main__":
    blog = get_urldic()
    urllist, search = blog.get_url()
    result = blog.get_soup(blog.get_html(urllist))
    for blog_name, blog_url in result.items():
        print('search blog_name is:%s,blog_url is:%s' % (blog_name, blog_url))

51BLOG/main.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/bin/env python
2+
# -*- coding:utf-8 -*-
3+
4+
5+
import geturl3
6+
import getexcel3
7+
8+
def get_dic():
    """Run the interactive blog search.

    Returns:
        ``(result, search)``: the mapping produced by ``get_soup`` and
        the keyword returned by ``get_url``.
    """
    crawler = geturl3.get_urldic()
    page_urls, keyword = crawler.get_url()
    pages = crawler.get_html(page_urls)
    return crawler.get_soup(pages), keyword
15+
16+
def write_excle(urldic,search):
    """Write the search results into an Excel workbook named after ``search``.

    Steps, in order: create the workbook, size the columns, register the
    formats, write title, tags and content, then close the workbook.
    """
    builder = getexcel3.create_excle()
    book, sheet = builder.create_workbook(search)
    builder.col_row(sheet)
    title_fmt, tag_fmt, body_fmt = builder.shell_format(book)
    builder.write_title(sheet, search, title_fmt)
    builder.write_tag(sheet, tag_fmt)
    builder.write_context(sheet, urldic, body_fmt)
    builder.workbook_close(book)
26+
27+
def main():
    """Collect the search results and write them to the Excel report."""
    write_excle(*get_dic())


if __name__ == "__main__":
    main()

DYTT8/getexceldytt.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#!/bin/env python
2+
# -*- coding:utf-8 -*-
3+
# @Author : kaliarch
4+
5+
import xlsxwriter
6+
7+
class create_excle:
    """Build the .xlsx report with movie links and a second details sheet.

    Result sheet layout: row 0 holds the merged title, row 1 the column
    tags, and the search results start at row 2. The details sheet is
    filled from row 1 onwards.
    """

    def __init__(self):
        # Column headers written on the row directly below the title.
        self.tag_list = ["movie_name", "movie_url"]
        # Name of the worksheet that receives the per-movie details.
        self.info = "information"

    def create_workbook(self, search=" "):
        """Create ``<search>.xlsx`` with a result sheet and a details sheet.

        Returns:
            ``(workbook, worksheet, worksheet_info)``.
        """
        excle_name = search + '.xlsx'
        workbook = xlsxwriter.Workbook(excle_name)
        worksheet_M = workbook.add_worksheet(search)
        worksheet_info = workbook.add_worksheet(self.info)
        print('create %s....' % excle_name)
        return workbook, worksheet_M, worksheet_info

    def col_row(self, worksheet):
        """Set the title-row height and the width of columns A and B."""
        worksheet.set_row(0, 17)
        # Both columns share one width, so a single range call is enough.
        worksheet.set_column('A:B', 58)

    def shell_format(self, workbook):
        """Register the cell formats used by the report.

        Returns:
            ``(merge_format, name_format, normal_format)`` for the title,
            the column tags and the body cells respectively.
        """
        # Title and tag cells differ only in their fill colour.
        header_base = {
            'bold': 1,
            'border': 1,
            'align': 'center',
            'valign': 'vcenter',
        }
        merge_format = workbook.add_format(dict(header_base, fg_color='#FAEBD7'))
        name_format = workbook.add_format(dict(header_base, fg_color='#E0FFFF'))
        normal_format = workbook.add_format({'align': 'center'})
        return merge_format, name_format, normal_format

    def write_title(self, worksheet, search, merge_format):
        """Write the merged title cell (A1:B1) for the given search keyword."""
        title = search + "搜索结果"
        worksheet.merge_range('A1:B1', title, merge_format)
        print('write title success')

    def write_tag(self, worksheet, name_format):
        """Write the column tags from ``self.tag_list`` on row 1."""
        for tag_col, tag in enumerate(self.tag_list):
            worksheet.write(1, tag_col, tag, name_format)
        print('write tag success')

    def write_context(self, worksheet, con_dic, normal_format):
        """Write every ``name -> url`` pair of ``con_dic``, one per row.

        Rows start at 2 because rows 0 and 1 hold the title and the tags.
        Every entry is written; the loop is bounded by the dictionary
        itself rather than by comparing the row index with its length,
        which would stop one entry early.
        """
        for row, (name, url) in enumerate(con_dic.items(), start=2):
            worksheet.write(row, 0, name, normal_format)
            worksheet.write(row, 1, url, normal_format)
        print('write context success')

    def write_info(self, worksheet_info, info_list, normal_format):
        """Write one row per entry of ``info_list``, one cell per item.

        The first entry goes to row 1; row 0 is left empty.
        """
        for row, infomsg in enumerate(info_list, start=1):
            for col, value in enumerate(infomsg):
                worksheet_info.write(row, col, value, normal_format)
        print("wirte info success")

    def workbook_close(self, workbook):
        """Close the workbook."""
        workbook.close()
91+
92+
93+
# Running this file directly only prints a notice; it is meant to be imported.
if __name__ == "__main__":
    print('This is create excel mode')

DYTT8/geturldytt.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/bin/env python
2+
# -*- coding:utf-8 -*-
3+
from urllib import parse
4+
import requests
5+
from bs4 import BeautifulSoup
6+
#http://s.dydytt.net/plus/search.php?keyword=%BF%C6%BB%C3&searchtype=titlekeyword&channeltype=0&orderby=&kwtype=0&pagesize=10&typeid=0&TotalResult=279&PageNo=4
7+
class get_urldic:
    """Search s.dydytt.net for movies and collect result links and details."""

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Search URL = first_url + encoded keyword + second_url + page number.
        self.first_url = 'http://s.dydytt.net/plus/search.php?'
        self.second_url = '&searchtype=titlekeyword&channeltype=0&orderby=&kwtype=0&pagesize=10&typeid=0&TotalResult=279&PageNo='
        # Prefix added to the relative detail-page links found in the results.
        self.info_url = 'http://s.dydytt.net'

    def get_url(self, search=None, page=None):
        """Build one search-result URL per requested page.

        Args:
            search: Keyword to search for; read from stdin when ``None``.
            page: Number of result pages; read from stdin when ``None``.

        Returns:
            ``(urlList, search)``: the URLs for pages 1..page and the
            keyword exactly as it was entered.

        Raises:
            SystemExit: if input cannot be read, the keyword cannot be
                encoded as GB2312, or the page count is not an integer.
        """
        try:
            if search is None:
                search = input("Please input search name:")
            # The keyword is percent-encoded from its GB2312 bytes.
            keyword_dic = parse.urlencode({'keyword': search}, encoding='gb2312')
            if page is None:
                page = input("Please input page:")
            page = int(page)
        except (ValueError, EOFError, TypeError) as e:
            # UnicodeEncodeError (keyword not representable) is a ValueError.
            print('Input error:', e)
            raise SystemExit(1)

        urlList = [
            self.first_url + keyword_dic + self.second_url + str(num)
            for num in range(1, page + 1)
        ]
        print("Please wait....")
        print(urlList)
        return urlList, search

    def get_html(self, urlList):
        """Download every URL and return the bodies as UTF-8 encoded bytes."""
        response_list = []
        for url in urlList:
            raw = requests.get(url, timeout=self.REQUEST_TIMEOUT).content
            # Transcode from GBK to UTF-8, dropping bytes that do not decode.
            response_list.append(raw.decode('gbk', 'ignore').encode('utf-8'))
        return response_list

    def get_soup(self, html_doc):
        """Extract ``{movie title: absolute detail url}`` from result pages.

        Result cells are the ``<td width="55%">`` elements; the link is the
        first ``<a>`` inside the cell's first ``<b>``. Cells without such a
        link are skipped. A repeated title keeps the link seen last.
        """
        result = {}
        for page_html in html_doc:
            soup = BeautifulSoup(page_html, 'html.parser')
            for cell in soup.find_all('td', width="55%"):
                bold = cell.b
                link = bold.a if bold is not None else None
                href = link.get('href') if link is not None else None
                if href is None:
                    continue
                result[cell.get_text().strip()] = self.info_url + href
        return result

    def get_info(self, info_dic):
        """Fetch each movie's detail page and pick out the notable fields.

        Args:
            info_dic: Mapping of movie title to detail-page URL, as
                returned by ``get_soup``; only the URLs are used.

        Returns:
            One list of extracted strings per movie, in iteration order.
        """
        info_tmp = []
        for detail_url in info_dic.values():
            print(detail_url)
            response = requests.get(detail_url, timeout=self.REQUEST_TIMEOUT)
            # Decode leniently, matching get_html, and parse the text
            # directly so the parser does not have to guess an encoding.
            page_text = response.content.decode('gbk', 'ignore')
            soup = BeautifulSoup(page_text, 'html.parser')
            movie_info = []
            for block in soup.find_all('div', class_="co_content8"):
                movie_info.extend(self._extract_info(block.get_text().split()))
            info_tmp.append(movie_info)
        return info_tmp

    @staticmethod
    def _extract_info(tokens):
        """Pick publish-date, Douban and download-address entries from tokens."""
        picked = []
        for idx, token in enumerate(tokens):
            if '发布' in token:
                # "published" marker: the token already carries the value.
                picked.append(token)
            elif "豆瓣" in token or "【下载地址】" in token:
                # Douban / download-address markers: the value is the next
                # token, which may be missing when the marker comes last.
                following = tokens[idx + 1] if idx + 1 < len(tokens) else ''
                picked.append(token + following)
        return picked
77+
78+
# Manual check: run one interactive search, print every hit, then print
# the details extracted for each movie.
if __name__ == '__main__':
    blog = get_urldic()
    urllist, search = blog.get_url()
    html_doc = blog.get_html(urllist)
    result = blog.get_soup(html_doc)
    for k, v in result.items():
        print('search blog_name is:%s,blog_url is:%s' % (k, v))
    info_list = blog.get_info(result)
    # The loop variable must not be called ``list``: that would rebind the
    # builtin in this module's globals.
    for movie_info in info_list:
        print(movie_info)

DYTT8/main.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/env python
2+
# -*- coding:utf-8 -*-
3+
4+
5+
import geturldytt
6+
import getexceldytt
7+
8+
def get_dic():
    """Run the interactive movie search and fetch the per-movie details.

    Returns:
        ``(result, search, info_list)``: the mapping produced by
        ``get_soup``, the keyword returned by ``get_url``, and the list
        returned by ``get_info``.
    """
    crawler = geturldytt.get_urldic()
    page_urls, keyword = crawler.get_url()
    pages = crawler.get_html(page_urls)
    movie_links = crawler.get_soup(pages)
    details = crawler.get_info(movie_links)
    return movie_links, keyword, details
16+
17+
def write_excle(urldic,search,info_list):
    """Write the search results and movie details into an Excel workbook.

    Steps, in order: create the workbook, size the columns, register the
    formats, write title, tags, content and details, then close the
    workbook.
    """
    builder = getexceldytt.create_excle()
    book, sheet, info_sheet = builder.create_workbook(search)
    builder.col_row(sheet)
    title_fmt, tag_fmt, body_fmt = builder.shell_format(book)
    builder.write_title(sheet, search, title_fmt)
    builder.write_tag(sheet, tag_fmt)
    builder.write_context(sheet, urldic, body_fmt)
    builder.write_info(info_sheet, info_list, body_fmt)
    builder.workbook_close(book)
28+
29+
def main():
    """Collect the search results and details, then write the Excel report."""
    write_excle(*get_dic())


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)