Skip to content

Commit 680de68

Browse files
committed
add sa_blog
1 parent 2058ab3 commit 680de68

2 files changed

Lines changed: 106 additions & 0 deletions

File tree

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,8 @@ blog:[Python多线程爬图&Scrapy框架爬图](http://blog.51cto.com/kaliarch
5959
> Python操作CVM
6060
6161
blog:[Python操作CVM](http://blog.51cto.com/kaliarch/2165000)
62+
63+
## 利用Python批量保存51CTO博客
64+
> 利用Python批量保存51CTO博客
65+
66+
blog:[利用Python批量保存51CTO博客](http://blog.51cto.com/kaliarch/2301359)

save_blog/save_blog.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#!/bin/env python
2+
# -*- coding:utf-8 -*-
3+
# _auth:kaliarch
4+
5+
import requests
6+
import time
7+
from bs4 import BeautifulSoup
8+
from selenium import webdriver
9+
10+
11+
class BlogSave():
    """Collect article URLs from a 51CTO blog and open each one in the web editor.

    The workflow has two stages:

    1. ``get_urldict`` scrapes the blog's list pages with ``requests`` and
       ``BeautifulSoup`` and builds a ``{title: article_url}`` mapping.
    2. ``save_blog`` drives a Chrome browser through Selenium: it logs in,
       opens the publish/editor URL of every article and clicks a toolbar
       control in the editor.

    ``run`` chains both stages.
    """

    # HTTP headers sent with every scraping request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.89 Safari/537.36"
    }

    # Seconds to wait on each scraping request; without a timeout a stalled
    # connection would block the whole run forever.
    request_timeout = 30

    def __init__(self, blog_name, page_number, login_user_name, login_passwd):
        """Store the blog to scrape and the credentials used for the browser login.

        :param blog_name: blog user name, used as the path segment of the blog URL
        :param page_number: how many list pages (``/p1`` .. ``/pN``) to scrape
        :param login_user_name: account name typed into the login form
        :param login_passwd: password typed into the login form
        """
        self.login_url = 'http://home.51cto.com/index'
        # Blog user name
        self.blog_name = blog_name
        # Number of blog list pages to process
        self.page_number = page_number
        # Login account
        self.login_user_name = login_user_name
        # Login password
        self.login_passwd = login_passwd
        # Path to the local chromedriver executable. Raw string: "\c" is not a
        # valid escape sequence and triggers a warning in a normal literal.
        self.chromedirve = r'D:\chromedriver.exe'
        # Base URL of the blog editor; the article id is appended to it
        self.blog_save_url = 'http://blog.51cto.com/blogger/publish/'

    def get_urldict(self):
        """Scrape the blog's list pages and return a ``{title: url}`` mapping.

        Every list page from 1 to ``page_number`` is fetched; each ``<a class="tit">``
        link found there is followed to read the article's ``<h1 class="artical-title">``.
        Links without an ``href`` and article pages without that heading are
        skipped instead of aborting the whole scrape.

        NOTE(review): titles are used as dictionary keys, so two articles with
        the same title collapse into one entry.

        :return: dict mapping article title text to article URL
        """
        content_dict = {}
        last_page = int(self.page_number)
        scrapy_urllist = [
            "http://blog.51cto.com/" + str(self.blog_name) + "/p" + str(page)
            for page in range(1, last_page + 1)
        ]
        for scrapy_url in scrapy_urllist:
            response = requests.get(
                scrapy_url, headers=BlogSave.headers, timeout=self.request_timeout
            )
            soup = BeautifulSoup(response.content, 'lxml', from_encoding='utf-8')

            for content in soup.find_all('a', class_='tit'):
                # Article URL
                url = content.get('href')
                if not url:
                    continue
                article = requests.get(
                    url, headers=BlogSave.headers, timeout=self.request_timeout
                )
                title_soup = BeautifulSoup(article.content, 'lxml', from_encoding='utf-8')
                title = title_soup.find_all('h1', class_='artical-title')
                if not title:
                    # No heading found (layout change, error page, ...): skip
                    # this article rather than crash with IndexError.
                    print("skip, no title found:", url)
                    continue
                # Article title
                title_text = title[0].get_text()
                content_dict[title_text] = url
                print(title_text, url)

        return content_dict

    def save_blog(self, url_list):
        """Log in with a simulated browser session and process every editor URL.

        For each URL the page is opened and a toolbar link inside
        ``#blogEditor-box`` is clicked (presumably the save/export control --
        confirm against the live editor). Failures for a single URL are
        appended to ``fail.log`` and do not stop the loop. The browser is
        always closed when the method ends.

        :param url_list: iterable of editor URLs to visit
        :return: None
        """
        browser = webdriver.Chrome(self.chromedirve)
        try:
            # Open the login page
            browser.get(self.login_url)
            time.sleep(2)
            # Log in
            browser.find_element_by_id('loginform-username').send_keys(self.login_user_name)
            browser.find_element_by_id('loginform-password').send_keys(self.login_passwd)
            browser.find_element_by_name('login-button').click()
            time.sleep(1)
            for url in url_list:
                browser.get(url)
                time.sleep(1)
                try:
                    browser.find_element_by_xpath('//*[@id="blogEditor-box"]/div[1]/a[14]').click()
                    time.sleep(2)
                except Exception as e:
                    # Best effort per article: record the failure and continue.
                    # One entry per line so the log stays readable.
                    with open('fail.log', 'a', encoding='utf-8') as f:
                        f.write(url + ' ' + str(e) + '\n')
        finally:
            # Release the browser and the chromedriver process even on errors.
            browser.quit()

    def run(self):
        """Scrape the article URLs, derive the editor URLs and process them."""
        # Title -> article URL mapping
        content_dict = self.get_urldict()
        # The article id is the last path segment of the article URL; strip a
        # trailing slash first so it never yields an empty id.
        id_list = [
            str(article_url).rstrip('/').split('/')[-1]
            for article_url in content_dict.values()
        ]
        result_list = [self.blog_save_url + article_id for article_id in id_list]
        print("result_list:", result_list)
        self.save_blog(result_list)
94+
95+
if __name__ == '__main__':
    # Script entry point: configure the blog to process and the login
    # account, then run the scrape-and-save workflow.
    settings = {
        'blog_name': 'kaliarch',
        'page_number': 5,
        'login_user_name': 'xxxxxxxxxxxxx@163.com',
        'login_passwd': 'qxxxxxxxxx',
    }
    blogOper = BlogSave(**settings)
    blogOper.run()

0 commit comments

Comments
 (0)