Skip to content

Commit da5aeda

Browse files
committed
0013: 爬链接中的图片
1 parent 0037e83 commit da5aeda

2 files changed

Lines changed: 78 additions & 1 deletion

File tree

xyjxyf/show_me_the_code.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,16 @@ def replace_sensitive_words(sensitive_file=None, input_string=None):
270270
print(input_string)
271271

272272
# 第 0013 题: 用 Python 写一个爬图片的程序
273+
from tools import geturlimgs
274+
275+
def get_url_imgs(url=None, to_dir="/Users/xieyajie/Desktop/Python/ShowMeCode/xyjxyf/0013/"):
    """Download every image referenced by the page at *url*.

    Args:
        url: Address of the page to crawl. ``None`` or an empty string
            makes the call a no-op.
        to_dir: Directory the images are written to. Defaults to the
            location that used to be hard-coded in the function body, so
            existing single-argument calls behave as before.

    Returns:
        Whatever ``geturlimgs.get_imgs`` returns (currently ``None``).
    """
    # Nothing to crawl: bail out before constructing the crawler.
    if not url:
        return None

    crawler = geturlimgs.geturlimgs()
    return crawler.get_imgs(url, to_dir)
281+
282+
273283

274284
# 第 0014 题: 纯文本文件 student.txt为学生信息, 写到 student.xls 文件中
275285
# 第 0015 题: 纯文本文件 city.txt为城市信息,写到 city.xls 文件中
@@ -464,7 +474,7 @@ def write_numbers_to_xml(list=None, to_path=None):
464474
# print("代码行数:%i\n注释行数:%i\n空行行数:%i" % (code, note, blank_line))
465475

466476
# 0008
467-
get_html_context("http://blog.bccn.net")
477+
# get_html_context("http://blog.bccn.net")
468478

469479
# 0009
470480
# get_html_links("http://blog.bccn.net")
@@ -479,6 +489,7 @@ def write_numbers_to_xml(list=None, to_path=None):
479489
# replace_sensitive_words("./0011/0011.txt", "haha, 北京不错")
480490

481491
# 0013
492+
# Problem 0013 driver: crawls this page over HTTP and saves its images to the
# download directory chosen in get_url_imgs.
get_url_imgs("http://www.ivsky.com/tupian/beijing_t1542/index_2.html")
482493

483494
# 0014
484495
# dictxt_to_xls("./0014/student.txt")

xyjxyf/tools/geturlimgs.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import socket, os
4+
from urllib import request
5+
from bs4 import BeautifulSoup
6+
7+
class geturlimgs(object):
    """Crawl a web page for ``<img>`` tags and download the referenced images.

    Args:
        to_dir: Optional default download directory. ``get_imgs`` falls back
            to it when the call itself does not pass a directory.
    """

    def __init__(self, to_dir=None):
        # This used to be spelled ``__index__`` (so it never ran as the
        # constructor) and always stored None instead of the argument.
        self.to_dir = to_dir

    # Pose as a regular browser so the site is less likely to block the crawler.
    def user_agent(self, url):
        """Open *url* with a browser-like ``User-Agent`` header.

        Args:
            url: Absolute URL of the resource to open.

        Returns:
            The open response object, or ``None`` when the URL is malformed,
            the request fails or it times out (the reason is printed).
        """
        req_header = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        req_timeout = 20
        try:
            req = request.Request(url, None, req_header)
            return request.urlopen(req, None, req_timeout)
        except request.URLError as e:
            # URLError exposes ``reason``; it has no ``message`` attribute on Python 3.
            print(e.reason)
        except socket.timeout:
            print("timeout")
        except ValueError as e:
            # Request rejects URLs without a usable scheme.
            print(e)

        # Every failure path ends here instead of touching an unbound local.
        return None

    def get_img_links(self, url=None):
        """Return the absolute URLs of all ``<img>`` sources found at *url*.

        Args:
            url: Address of the page to scan.

        Returns:
            A list of image URLs, or ``None`` when *url* is empty or the
            page could not be fetched.
        """
        from urllib.parse import urljoin

        if not url:
            return None

        html = self.user_agent(url)
        if html is None:
            return None

        # Close the HTTP response as soon as the document has been parsed.
        with html:
            soup = BeautifulSoup(html, "lxml")

        links = []
        for item in soup.find_all('img'):
            link = item.get('src')
            # Skip <img> tags that carry no src attribute.
            if link:
                # Resolve relative and protocol-relative sources against the page URL.
                links.append(urljoin(url, link))

        return links

    def download_imgs(self, links, to_dir):
        """Download every URL in *links* into *to_dir*.

        Files are named ``<position><extension>``, where the position is the
        link's index in *links* and the extension comes from the URL path
        (``.jpg`` when the path has none). A link that cannot be downloaded
        is reported and skipped so the rest of the batch still runs.

        Args:
            links: Iterable of image URLs; ``None`` or empty is a no-op.
            to_dir: Destination directory, created if missing.
        """
        from urllib.parse import urlparse

        if not links:
            return

        os.makedirs(to_dir, exist_ok=True)

        for index, url in enumerate(links):
            # Take the extension from the path only, so a query string or a
            # bare host name ("example.com") cannot leak into the file name.
            end = os.path.splitext(urlparse(url).path)[1] or ".jpg"
            img_path = os.path.join(to_dir, '%i%s' % (index, end))
            try:
                request.urlretrieve(url, img_path)
            except (OSError, ValueError) as e:
                # OSError covers URLError/HTTPError/timeouts and file errors;
                # ValueError is raised for URLs without a usable scheme.
                print("skip %s: %s" % (url, e))

    def get_imgs(self, url=None, to_dir=None):
        """Fetch the page at *url* and download all of its images.

        Args:
            url: Address of the page to crawl.
            to_dir: Destination directory; falls back to the directory given
                to the constructor when omitted.

        Returns:
            ``None``. Nothing is done when *url* or the directory is missing.
        """
        if to_dir is None:
            to_dir = self.to_dir
        if url is None or to_dir is None:
            return None

        links = self.get_img_links(url)
        return self.download_imgs(links, to_dir)

0 commit comments

Comments
 (0)