Skip to content

Commit ff33b3a

Browse files
committed
python 学习记录
1 parent dcf8fc0 commit ff33b3a

62 files changed

Lines changed: 4480 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

py_taobaomm_pic/.idea/Py_taobaomm_pic.iml

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

py_taobaomm_pic/.idea/misc.xml

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

py_taobaomm_pic/.idea/modules.xml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

py_taobaomm_pic/.idea/workspace.xml

Lines changed: 330 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

py_taobaomm_pic/taobaomm.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
# -*- coding: utf-8 -*-
2+
#---------------------------------------
3+
# 程序:本地视频--Python培训之美眉图片下载爬虫--小甲鱼
4+
# 网址:
5+
# 日期:2016-08-29
6+
# 语言:Python 2.7
7+
#---------------------------------------
8+
import urllib2
9+
import urllib
10+
import re
11+
import requests
12+
13+
14+
def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    The path is normalized first: surrounding whitespace and a trailing
    backslash are stripped.

    Returns:
        True  -- the directory was created by this call.
        False -- the directory already existed.
    """
    # Module import kept local, matching the original script's style.
    import os

    # Strip leading/trailing spaces, then a trailing "\" separator.
    path = path.strip()
    path = path.rstrip("\\")

    if os.path.exists(path):
        # Already present: report and do nothing.
        print(path + ' 目录已存在')
        return False

    # Create the tree FIRST, then report success, so the success message
    # is only printed when makedirs actually completed (the original
    # printed before creating, claiming success even on failure).
    os.makedirs(path)
    print(path + ' 创建成功')
    return True
39+
40+
41+
# Example usage of mkdir():
# mkpath = "d:\\qttc\\web\\"
# mkdir(mkpath)

import os

# Taobao model list page; the page number is appended to this base URL.
mmurl = "https://mm.taobao.com/json/request_top_list.htm?type=0&page="

i = 0
while i < 20:
    # Fetch one list page (pages 0..19).
    url = mmurl + str(i)
    up = urllib2.urlopen(url)
    cont = up.read().decode("gbk")

    # Cut the numeric user id out of the model-card link.
    ahref2 = '''model_card.htm?user_id='''
    target2 = '''" target='''
    pa2 = cont.find(ahref2)
    pb2 = cont.find(target2, pa2)
    if pa2 == -1 or pb2 == -1:
        # Marker not found (layout change / empty page): the original
        # sliced with -1 indexes and produced garbage -- skip instead.
        i += 1
        continue
    user_id = cont[pa2 + len(ahref2):pb2]

    # The profile page embeds the model's gallery domain in a <span>
    # next to the "mm-p-domain-info" marker (found by reading the page
    # source; the link shown on the list page does not work directly).
    modelurl = 'https://mm.taobao.com/self/info/model_info_show.htm?user_id=' + user_id
    mup = urllib2.urlopen(modelurl)
    mcont = mup.read().decode("gbk")

    m = re.search('mm-p-domain-info.*?<span>(.*?)</span>', mcont, re.S)
    if m is None:
        # No personal domain on this profile: the original crashed with
        # AttributeError on .group(1) -- skip this model instead.
        i += 1
        continue
    yuming_url = 'https:' + m.group(1)
    print(yuming_url)

    # Second hop: the personal domain page holds the real photo URLs.
    mup2 = urllib2.urlopen(yuming_url)
    mcont2 = mup2.read().decode("gbk")

    # Extract every inline photo URL (protocol-relative) in one pass.
    pic_text = re.findall(';" src="(.*?)"/>', mcont2, re.S)

    # One directory per model, named after the user id.
    k = str(user_id)
    mkdir(k)
    j = 0
    for each in pic_text:
        print('now downloading:' + 'https:' + each)
        # os.path.join puts the file inside the per-model directory on
        # any platform; the original's hard-coded '\\' concatenation
        # failed to land files in the directory (author's own note).
        urllib.urlretrieve('https:' + each,
                           os.path.join(k, str(user_id) + '-' + str(j + 1) + '.jpg'))
        j += 1
        if j >= 100:
            # Cap at 100 pictures per model (same count as the original
            # "if j==99: break" placed before the increment).
            break
    i += 1
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# -*- coding: utf-8 -*-
2+
#---------------------------------------
3+
# 程序:本地视频--Python培训之美眉图片下载爬虫--小甲鱼
4+
# 说明:taobaomm_v1.0-调试通了.py
5+
# 用途:下载图片,创建目录
6+
# 网址:
7+
# 日期:2016-08-29
8+
# 语言:Python 2.7
9+
#---------------------------------------
10+
import urllib2
11+
import urllib
12+
import re
13+
import requests
14+
15+
16+
def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    The path is normalized first: surrounding whitespace and a trailing
    backslash are stripped.

    Returns:
        True  -- the directory was created by this call.
        False -- the directory already existed.
    """
    # Module import kept local, matching the original script's style.
    import os

    # Strip leading/trailing spaces, then a trailing "\" separator.
    path = path.strip()
    path = path.rstrip("\\")

    if os.path.exists(path):
        # Already present: report and do nothing.
        print(path + ' 目录已存在')
        return False

    # Create the tree FIRST, then report success, so the success message
    # is only printed when makedirs actually completed (the original
    # printed before creating, claiming success even on failure).
    os.makedirs(path)
    print(path + ' 创建成功')
    return True
41+
42+
43+
# Example usage of mkdir():
# mkpath = "d:\\qttc\\web\\"
# mkdir(mkpath)

import os

# Taobao model list page; the page number is appended to this base URL.
mmurl = "https://mm.taobao.com/json/request_top_list.htm?type=0&page="

i = 0
while i < 20:
    # Fetch one list page (pages 0..19).
    url = mmurl + str(i)
    up = urllib2.urlopen(url)
    cont = up.read().decode("gbk")

    # Cut the numeric user id out of the model-card link.
    ahref2 = '''model_card.htm?user_id='''
    target2 = '''" target='''
    pa2 = cont.find(ahref2)
    pb2 = cont.find(target2, pa2)
    if pa2 == -1 or pb2 == -1:
        # Marker not found (layout change / empty page): the original
        # sliced with -1 indexes and produced garbage -- skip instead.
        i += 1
        continue
    user_id = cont[pa2 + len(ahref2):pb2]

    # The profile page embeds the model's gallery domain in a <span>
    # next to the "mm-p-domain-info" marker (found by reading the page
    # source; the link shown on the list page does not work directly).
    modelurl = 'https://mm.taobao.com/self/info/model_info_show.htm?user_id=' + user_id
    mup = urllib2.urlopen(modelurl)
    mcont = mup.read().decode("gbk")

    m = re.search('mm-p-domain-info.*?<span>(.*?)</span>', mcont, re.S)
    if m is None:
        # No personal domain on this profile: the original crashed with
        # AttributeError on .group(1) -- skip this model instead.
        i += 1
        continue
    yuming_url = 'https:' + m.group(1)
    print(yuming_url)

    # Second hop: the personal domain page holds the real photo URLs.
    mup2 = urllib2.urlopen(yuming_url)
    mcont2 = mup2.read().decode("gbk")

    # Extract every inline photo URL (protocol-relative) in one pass.
    pic_text = re.findall(';" src="(.*?)"/>', mcont2, re.S)

    # One directory per model, named after the user id.
    k = str(user_id)
    mkdir(k)
    j = 0
    for each in pic_text:
        print('now downloading:' + 'https:' + each)
        # os.path.join puts the file inside the per-model directory on
        # any platform; the original's hard-coded '\\' concatenation
        # failed to land files in the directory (author's own note).
        urllib.urlretrieve('https:' + each,
                           os.path.join(k, str(user_id) + '-' + str(j + 1) + '.jpg'))
        j += 1
        if j >= 100:
            # Cap at 100 pictures per model (same count as the original
            # "if j==99: break" placed before the increment).
            break
    i += 1

py_youtube_study/.idea/dictionaries/zhangcheng.xml

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

py_youtube_study/.idea/misc.xml

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

py_youtube_study/.idea/modules.xml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)