0013: 爬链接中的图片

xieyajie · xieyajie · commit da5aedaeda77 · 2016-03-26T21:59:59.000+08:00
diff --git a/xyjxyf/show_me_the_code.py b/xyjxyf/show_me_the_code.py
@@ -270,6 +270,16 @@ def replace_sensitive_words(sensitive_file=None, input_string=None):
     print(input_string)
 
 # 第 0013 题： 用 Python 写一个爬图片的程序
+from tools import geturlimgs
+
+def get_url_imgs(url=None, to_dir="/Users/xieyajie/Desktop/Python/ShowMeCode/xyjxyf/0013/"):
+    """Save every <img> referenced by the page at url into to_dir (None url: no-op)."""
+    if url is None:
+        return None
+    # The default to_dir is the original hard-coded folder, so old calls behave the same.
+    geturlimgs.geturlimgs().get_imgs(url, to_dir)
+
+
 
 # 第 0014 题： 纯文本文件 student.txt为学生信息, 写到 student.xls 文件中
 # 第 0015 题： 纯文本文件 city.txt为城市信息,写到 city.xls 文件中
@@ -464,7 +474,7 @@ def write_numbers_to_xml(list=None, to_path=None):
     # print("代码行数:%i\n注释行数:%i\n空行行数:%i" % (code, note, blank_line))
 
     # 0008
-    get_html_context("http://blog.bccn.net")
+    # get_html_context("http://blog.bccn.net")
 
     # 0009
     # get_html_links("http://blog.bccn.net")
@@ -479,6 +489,7 @@ def write_numbers_to_xml(list=None, to_path=None):
     # replace_sensitive_words("./0011/0011.txt", "haha, 北京不错")
 
     # 0013
+    get_url_imgs("http://www.ivsky.com/tupian/beijing_t1542/index_2.html")
 
     # 0014
     # dictxt_to_xls("./0014/student.txt")
diff --git a/xyjxyf/tools/geturlimgs.py b/xyjxyf/tools/geturlimgs.py
@@ -0,0 +1,86 @@
+# encoding = utf-8
+
+import socket, os
+from urllib import request
+from urllib.parse import urljoin, urlparse
+from bs4 import BeautifulSoup
+
+class geturlimgs(object):
+    """Collect the <img> links of a web page and download them to a folder."""
+
+    def __init__(self, to_dir=None):
+        # Default download folder, used by get_imgs() when none is passed.
+        self.to_dir = to_dir
+
+    # Pose as a browser so the site is less likely to block the request
+    def user_agent(self, url):
+        """Open url with a browser User-Agent and a 20 second timeout.
+
+        Returns the open response, or None when the request fails; the
+        failure is printed rather than raised.
+        """
+        req_header = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
+        req_timeout = 20
+        try:
+            req = request.Request(url, None, req_header)
+            return request.urlopen(req, None, req_timeout)
+        except socket.timeout:
+            print("timeout")
+        except request.URLError as e:
+            # Python 3 exceptions have no .message; URLError exposes .reason
+            print(e.reason)
+        except ValueError as e:
+            # Request() rejects URLs without a scheme
+            print(e)
+        return None
+
+    def get_img_links(self, url=None):
+        """Return the src of every <img> on the page at url, as absolute URLs.
+
+        Returns None when url is empty or the page could not be opened.
+        """
+        if not url:
+            return None
+        html = self.user_agent(url)
+        if html is None:
+            return None
+        with html:
+            soup = BeautifulSoup(html, "lxml")
+
+        # Skip <img> tags without src; resolve relative src against the page
+        return [urljoin(url, item.get('src'))
+                for item in soup.find_all('img') if item.get('src')]
+
+    def download_imgs(self, links, to_dir):
+        """Save each link into to_dir as <position><extension>.
+
+        The extension comes from the URL path (".jpg" when it has none).
+        A link that cannot be downloaded is reported and skipped.
+        """
+        if not links:
+            return
+
+        os.makedirs(to_dir, exist_ok=True)
+
+        for index, url in enumerate(links):
+            # Use only the URL path so a ?query does not end up in the name
+            end = os.path.splitext(urlparse(url).path)[1] or ".jpg"
+            img_path = os.path.join(to_dir, '%i%s' % (index, end))
+            try:
+                request.urlretrieve(url, img_path)
+            except (request.URLError, socket.timeout, ValueError) as e:
+                print("skip %s: %s" % (url, e))
+
+    def get_imgs(self, url=None, to_dir=None):
+        """Download every image on the page at url into to_dir.
+
+        to_dir defaults to the folder given to the constructor. Returns
+        None; nothing is done when url or the folder is missing.
+        """
+        if to_dir is None:
+            to_dir = self.to_dir
+        if url is None or to_dir is None:
+            return None
+
+        links = self.get_img_links(url)
+        return self.download_imgs(links, to_dir)