|
| 1 | +#!/bin/env python |
| 2 | +# -*- coding:utf-8 -*- |
| 3 | +# _auth:kaliarch |
| 4 | + |
| 5 | +import requests |
| 6 | +import time |
| 7 | +from bs4 import BeautifulSoup |
| 8 | +from selenium import webdriver |
| 9 | + |
| 10 | + |
class BlogSave:
    """Re-save a 51CTO blogger's articles by driving the publish page in Chrome.

    Workflow (see ``run``): scrape article URLs from the public list pages of
    ``blog_name``, turn each article id into a publish-page URL, log in through
    Selenium and click the editor control on every publish page.
    """

    # Browser-like User-Agent sent with every ``requests`` call.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.89 Safari/537.36"
    }
    # Seconds to wait on each HTTP request; without a timeout a stalled
    # server would block the scrape forever.
    request_timeout = 10
    # File that collects the publish URLs whose save click failed.
    fail_log = 'fail.log'

    def __init__(self, blog_name, page_number, login_user_name, login_passwd):
        """Store the scrape target and the login credentials.

        :param blog_name: blog user name, used as the path segment of the list URLs
        :param page_number: how many list pages to scrape (converted with ``int``)
        :param login_user_name: account name typed into the login form
        :param login_passwd: password typed into the login form
        """
        self.login_url = 'http://home.51cto.com/index'
        # Blog user name.
        self.blog_name = blog_name
        # Number of list pages to process.
        self.page_number = page_number
        # Login account.
        self.login_user_name = login_user_name
        # Login password.
        self.login_passwd = login_passwd
        # Path of the local chromedriver binary. Raw string: '\c' is not a
        # valid escape sequence, so the plain literal triggers a warning.
        self.chromedirve = r'D:\chromedriver.exe'
        # Prefix of the publish (import) page; the article id is appended.
        self.blog_save_url = 'http://blog.51cto.com/blogger/publish/'

    def _fetch_soup(self, url):
        """Download ``url`` and return the body parsed with the lxml parser."""
        response = requests.get(url, headers=self.headers, timeout=self.request_timeout)
        return BeautifulSoup(response.content, 'lxml', from_encoding='utf-8')

    def _publish_urls(self, content_dict):
        """Return the publish-page URL for every article URL in ``content_dict``.

        The article id is taken as the last path segment; a trailing slash is
        stripped first so it cannot produce an empty id.
        """
        return [
            self.blog_save_url + str(article_url).rstrip('/').split('/')[-1]
            for article_url in content_dict.values()
        ]

    def get_urldict(self):
        """Scrape title and URL of every article on the first ``page_number`` list pages.

        Each article page is fetched as well, because the title is read from
        its ``<h1 class="artical-title">`` element. Links without an ``href``
        and pages without that heading are skipped instead of raising.

        :return: dict mapping article title to article URL; articles sharing a
            title overwrite each other (the last one wins)
        """
        content_dict = {}
        for page in range(1, int(self.page_number) + 1):
            list_url = "http://blog.51cto.com/" + str(self.blog_name) + "/p" + str(page)
            for link in self._fetch_soup(list_url).find_all('a', class_='tit'):
                url = link.get('href')
                if not url:
                    continue
                headings = self._fetch_soup(url).find_all('h1', class_='artical-title')
                if not headings:
                    print("skip, no title found:", url)
                    continue
                title = headings[0].get_text()
                content_dict[title] = url
                print(title, url)

        return content_dict

    def save_blog(self, url_list):
        """Log in with Chrome and click the editor control on every URL in ``url_list``.

        A failing click is appended to ``fail_log`` (one line per URL) and the
        loop continues with the next URL. The browser is always closed, even
        when login or navigation raises.

        :param url_list: publish-page URLs to visit
        :return: None
        """
        # Nothing to save: do not start a browser or log in for no work.
        if not url_list:
            return

        # NOTE(review): the positional driver path and the find_element_by_*
        # helpers are Selenium 3 APIs - confirm the installed Selenium version.
        browser = webdriver.Chrome(self.chromedirve)
        try:
            # Open the login page and give it time to render.
            browser.get(self.login_url)
            time.sleep(2)
            # Fill in and submit the login form.
            browser.find_element_by_id('loginform-username').send_keys(self.login_user_name)
            browser.find_element_by_id('loginform-password').send_keys(self.login_passwd)
            browser.find_element_by_name('login-button').click()
            time.sleep(1)
            for url in url_list:
                browser.get(url)
                time.sleep(1)
                try:
                    # 14th link of the editor toolbar - presumably the
                    # save/publish button; verify against the page markup.
                    browser.find_element_by_xpath('//*[@id="blogEditor-box"]/div[1]/a[14]').click()
                    time.sleep(2)
                except Exception as e:
                    # Deliberate best effort: record the failure and go on.
                    with open(self.fail_log, 'a', encoding='utf-8') as f:
                        f.write(url + ' ' + str(e).strip() + '\n')
        finally:
            # Release the browser and the chromedriver process.
            browser.quit()

    def run(self):
        """Scrape the article URLs, build the publish URLs and save each article."""
        # Title -> article URL mapping.
        content_dict = self.get_urldict()
        # Publish-page URL for each scraped article.
        result_list = self._publish_urls(content_dict)
        print("result_list:", result_list)
        self.save_blog(result_list)
| 94 | + |
if __name__ == '__main__':
    # Script entry point: process the first five list pages of the
    # 'kaliarch' blog with the (placeholder) account below.
    settings = {
        'blog_name': 'kaliarch',
        'page_number': 5,
        'login_user_name': 'xxxxxxxxxxxxx@163.com',
        'login_passwd': 'qxxxxxxxxx',
    }
    blogOper = BlogSave(**settings)
    blogOper.run()
0 commit comments