Skip to content

Commit 7ebd51c

Browse files
authored
Add files via upload
1 parent d9bc2d8 commit 7ebd51c

2 files changed

Lines changed: 162 additions & 0 deletions

File tree

BSGS_Rent/house_data_crawler.py

Lines changed: 143 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,143 @@
1+
import os
2+
import re
3+
import time
4+
import requests
5+
from pymongo import MongoClient
6+
from info import rent_type, city_info
7+
8+
9+
class Rent(object):
    """Crawl Lianjia rental listings and store them in MongoDB.

    The crawler walks every rental type (whole / shared) and every district of
    every configured city.  For each district it first collects the business
    circles ("bizcircles") and then pages through the listing API one
    bizcircle at a time, because querying by district alone caps the result
    at roughly 2000 records.
    """

    # Number of listings requested per API page.
    PAGE_SIZE = 30
    # How many times a failing page is retried before the bizcircle is skipped.
    MAX_RETRIES = 3
    # Seconds to wait between retries of a failing page.
    RETRY_DELAY = 1

    # Character classes instead of a greedy ``.*`` so that several attributes
    # on the same line cannot be swallowed into a single capture.
    _BIZCIRCLE_RE = re.compile(r'data-type="bizcircle" data-key="([^"]+)" class="oneline ">')
    _LONGITUDE_RE = re.compile(r"longitude: '([^']*)',")
    _LATITUDE_RE = re.compile(r"latitude: '([^']*)'")
    _DISTANCE_RE = re.compile(r'<span class="fr">(\d*)米</span>')

    def __init__(self):
        """Load the crawl configuration and connect to MongoDB.

        Connection settings are read from the ``MONGODB_HOST``,
        ``MONGODB_PORT`` and ``MONGODB_DATABASE`` environment variables and
        default to a local server and the ``Lianjia`` database.
        """
        self.rent_type = rent_type
        self.city_info = city_info

        host = os.environ.get('MONGODB_HOST', '127.0.0.1')  # local database by default
        port = os.environ.get('MONGODB_PORT', '27017')  # database port
        mongo_url = 'mongodb://{}:{}'.format(host, port)
        mongo_db = os.environ.get('MONGODB_DATABASE', 'Lianjia')
        client = MongoClient(mongo_url)
        self.db = client[mongo_db]
        # The mobile-site URL is the de-duplication key for listings.
        self.db['zufang'].create_index('m_url', unique=True)

    def get_data(self):
        """Crawl listings for every rental type, city and district.

        :return: None
        """
        for ty, type_code in self.rent_type.items():  # whole / shared rental
            for city, info in self.city_info.items():  # city and its district table
                city_id, city_py, districts = info[0], info[1], info[2]
                for dist, dist_py in districts.items():  # district and its pinyin slug
                    # Round-trip the bizcircles through bc_list.txt so the
                    # current crawl position can be inspected from outside.
                    self._write_bc(self._fetch_bizcircles(city_py, dist_py))
                    for bc_name in self._read_bc():
                        self._crawl_bizcircle(city, city_id, dist, bc_name, ty, type_code)

    def _fetch_bizcircles(self, city_py, dist_py):
        """Return the bizcircle keys listed on a district's mobile page.

        :param city_py: city abbreviation used in the mobile-site URL
        :param dist_py: district pinyin slug used in the mobile-site URL
        :return: list of bizcircle keys; empty if the page cannot be fetched
        """
        url = 'https://m.lianjia.com/chuzu/{}/zufang/{}/'.format(city_py, dist_py)
        try:
            res_bc = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            # One unreachable district should not abort the whole crawl.
            print('获取{}的商圈列表失败:{}'.format(url, exc))
            return []
        return self._extract_bizcircles(res_bc.text)

    def _crawl_bizcircle(self, city, city_id, dist, bc_name, ty, type_code):
        """Page through the listing API for one bizcircle and store each page.

        A failing page is retried up to ``MAX_RETRIES`` times; after that the
        rest of the bizcircle is skipped instead of retrying forever.

        :return: None
        """
        idx = 0
        failures = 0
        while True:
            url = ('https://app.api.lianjia.com/Rentplat/v1/house/list?city_id={}&condition={}'
                   '/rt{}&limit={}&offset={}&request_ts={}&scene=list').format(
                       city_id, bc_name, type_code, self.PAGE_SIZE,
                       idx * self.PAGE_SIZE, int(time.time()))
            try:
                res = requests.get(url=url, timeout=10)
                data = res.json()['data']  # parse the payload once per page
                print('成功爬取{}市{}-{}的{}第{}页数据!'.format(city, dist, bc_name, ty, idx + 1))
                item = {'city': city, 'type': ty, 'dist': dist}
                self._parse_record(data['list'], item)
                total = int(data['total'])
            except Exception as exc:
                # Best-effort boundary: network, JSON-shape and database errors
                # all lead to a bounded retry.  ``Exception`` (not a bare
                # ``except``) keeps Ctrl-C working.
                failures += 1
                if failures > self.MAX_RETRIES:
                    print('重试{}次仍未成功,跳过{}市{}-{}的{}剩余数据:{}'.format(
                        self.MAX_RETRIES, city, dist, bc_name, ty, exc))
                    return
                print('链接访问不成功,正在重试!')
                time.sleep(self.RETRY_DELAY)
                continue

            failures = 0
            idx += 1
            if total / self.PAGE_SIZE <= idx:
                return

    def _parse_record(self, data, item):
        """Convert API listing records to documents and upsert them.

        :param data: list of listing dicts from the API response (may be empty or None)
        :param item: dict of shared fields (city, type, dist) copied into every document
        :return: None
        """
        if not data:
            return

        for rec in data:
            # Work on a copy so the caller's dict is not mutated.
            record = dict(item)
            record['bedroom_num'] = rec.get('frame_bedroom_num')
            record['hall_num'] = rec.get('frame_hall_num')
            record['bathroom_num'] = rec.get('frame_bathroom_num')
            record['rent_area'] = rec.get('rent_area')
            record['house_title'] = rec.get('house_title')
            record['resblock_name'] = rec.get('resblock_name')
            record['bizcircle_name'] = rec.get('bizcircle_name')
            record['layout'] = rec.get('layout')
            record['rent_price_listing'] = rec.get('rent_price_listing')
            record['house_tag'] = self._parse_house_tags(rec.get('house_tags'))
            record['frame_orientation'] = rec.get('frame_orientation')
            record['m_url'] = rec.get('m_url')
            record['rent_price_unit'] = rec.get('rent_price_unit')

            # Coordinates and distance are only on the listing's detail page;
            # a failed detail request leaves them as None.
            try:
                res2 = requests.get(record['m_url'], timeout=5)
                location = self._extract_location(res2.text)
            except requests.RequestException:
                location = (None, None, None)
            record['longitude'], record['latitude'], record['distance'] = location

            self.db['zufang'].update_one({'m_url': record['m_url']}, {'$set': record}, upsert=True)
            print('成功保存数据:{}!'.format(record))

    @classmethod
    def _extract_bizcircles(cls, html):
        """Return every non-empty bizcircle ``data-key`` found in ``html``."""
        return cls._BIZCIRCLE_RE.findall(html)

    @classmethod
    def _extract_location(cls, html):
        """Extract ``(longitude, latitude, distance)`` from a detail page.

        :param html: text of the listing's mobile detail page
        :return: tuple of strings; all three are None when either coordinate
                 is missing, and distance alone is None when only it is missing
        """
        lon = cls._LONGITUDE_RE.findall(html)
        lat = cls._LATITUDE_RE.findall(html)
        if not lon or not lat:
            return None, None, None
        distance = cls._DISTANCE_RE.findall(html)
        return lon[0], lat[0], distance[0] if distance else None

    @staticmethod
    def _parse_house_tags(house_tag):
        """Flatten the ``house_tags`` field into a space-separated string.

        :param house_tag: list of tag dicts carrying a ``name`` key, or None
        :return: the tag names joined by single spaces, or None when there are no tags
        """
        if not house_tag:
            return None
        names = (tag.get('name') for tag in house_tag)
        # Tags without a name are skipped instead of raising on None + str.
        return ' '.join(name for name in names if name is not None).strip()

    @staticmethod
    def _write_bc(bc_list):
        """Write the bizcircles to ``bc_list.txt``, one per line.

        The file is overwritten on every call.

        :param bc_list: list of bizcircle keys
        :return: None
        """
        with open('bc_list.txt', 'w', encoding='utf-8') as f:
            f.writelines(bc + '\n' for bc in bc_list)

    @staticmethod
    def _read_bc():
        """Read the bizcircles back from ``bc_list.txt``.

        :return: list of bizcircle keys with surrounding whitespace removed;
                 blank lines are ignored
        """
        with open('bc_list.txt', 'r', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            return [bc for bc in stripped if bc]
139+
140+
141+
if __name__ == '__main__':
    # Script entry point: build the crawler (which connects to MongoDB in its
    # constructor) and run one full crawl.
    rent = Rent()
    rent.get_data()

BSGS_Rent/info.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,19 @@
1+
# Rental type label -> numeric type code.  The crawler appends the code to the
# listing API's ``condition`` parameter as ``/rt<code>``.
rent_type = {
    '整租': 200600000001,  # whole-unit rental
    '合租': 200600000002,  # shared rental
}
2+
3+
# Per-city crawl configuration, keyed by city name.  Each value is a list of
#   [0] the numeric city id sent to the listing API as ``city_id``,
#   [1] the city abbreviation used in the mobile-site URL path,
#   [2] a dict mapping district name -> district pinyin slug for that URL.
city_info = {
    '北京': [110000, 'bj', {
        '东城': 'dongcheng', '西城': 'xicheng', '朝阳': 'chaoyang', '海淀': 'haidian',
        '丰台': 'fengtai', '石景山': 'shijingshan', '通州': 'tongzhou', '昌平': 'changping',
        '大兴': 'daxing', '亦庄开发区': 'yizhuangkaifaqu', '顺义': 'shunyi', '房山': 'fangshan',
        '门头沟': 'mentougou', '平谷': 'pinggu', '怀柔': 'huairou', '密云': 'miyun',
        '延庆': 'yanqing',
    }],
    '上海': [310000, 'sh', {
        '静安': 'jingan', '徐汇': 'xuhui', '黄浦': 'huangpu', '长宁': 'changning',
        '普陀': 'putuo', '浦东': 'pudong', '宝山': 'baoshan', '闸北': 'zhabei',
        '虹口': 'hongkou', '杨浦': 'yangpu', '闵行': 'minhang', '金山': 'jinshan',
        '嘉定': 'jiading', '崇明': 'chongming', '奉贤': 'fengxian', '松江': 'songjiang',
        '青浦': 'qingpu',
    }],
    '广州': [440100, 'gz', {
        '天河': 'tianhe', '越秀': 'yuexiu', '荔湾': 'liwan', '海珠': 'haizhu', '番禺': 'panyu',
        '白云': 'baiyun', '黄埔': 'huangpu', '从化': 'conghua', '增城': 'zengcheng',
        '花都': 'huadu', '南沙': 'nansha',
    }],
    '深圳': [440300, 'sz', {
        '罗湖区': 'luohuqu', '福田区': 'futianqu', '南山区': 'nanshanqu',
        '盐田区': 'yantianqu', '宝安区': 'baoanqu', '龙岗区': 'longgangqu',
        '龙华区': 'longhuaqu', '光明区': 'guangmingqu', '坪山区': 'pingshanqu',
        '大鹏新区': 'dapengxinqu',
    }],
}

0 commit comments

Comments
 (0)