Skip to content

Commit bec3a28

Browse files
authored
Add files via upload
1 parent 8cd2a0a commit bec3a28

1 file changed

Lines changed: 63 additions & 0 deletions

File tree

Roads/Country_Raod.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import re
2+
import requests
3+
from lxml import etree
4+
from pymongo import MongoClient
5+
6+
7+
class Roads(object):
    """Crawl road names per city from city8.com and store them in MongoDB.

    Typical use is ``get_city_list()``, then ``get_alphabet()``, then
    ``get_city_roads()``; the last call fetches the first two on demand if
    they have not been run yet.

    Attributes:
        city_list: dict mapping city name -> city base URL (without the
            trailing slash), or None until ``get_city_list()`` has run.
        alphabets: list of index strings appended to ``<city url>/road/``
            to build the per-letter road pages, or None until
            ``get_alphabet()`` has run.
        na_city: names of cities whose ``/road`` page did not contain the
            marker ``/road/a/``.
        col: the ``city_roads`` collection of the ``Country_Road`` database
            on the MongoDB server at 127.0.0.1:27017.
    """

    # Seconds to wait on a single HTTP request. Without a timeout a stalled
    # connection would block the whole crawl indefinitely.
    REQUEST_TIMEOUT = 10

    # Captures the href and the link text from the *same* anchor, so a name
    # can never be paired with the URL of a different link.
    _CITY_LINK_RE = re.compile(
        r"<a target='_blank' href='([^']*city8\.com)/'>(.*?)</a></li>"
    )

    _ALPHABET_XPATH = "/html/body/div/div[2]/div[3]/div[1]/div[1]/div[1]/a/text()"
    _ROADS_XPATH = '/html/body/div/div[2]/div[3]/div[1]/div[2]/a/text()'

    def __init__(self):
        """Initialise empty crawl state and open the MongoDB collection."""
        self.city_list = None
        self.alphabets = None
        self.na_city = []
        client = MongoClient('127.0.0.1', 27017)
        self.col = client.get_database('Country_Road').get_collection('city_roads')

    @classmethod
    def _parse_city_list(cls, html):
        """Return ``{city name: base URL}`` for every city link in *html*."""
        return {name: href for href, name in cls._CITY_LINK_RE.findall(html)}

    @staticmethod
    def _xpath_texts(html, expr):
        """Parse *html* and return the result of evaluating XPath *expr*."""
        parsed = etree.HTML(html)
        # NOTE(review): etree.HTML can yield None for an empty/unparsable
        # document in some lxml versions; treat that as "no matches".
        if parsed is None:
            return []
        return parsed.xpath(expr)

    def _fetch(self, url):
        """GET *url* and return the response body as text."""
        return requests.get(url, timeout=self.REQUEST_TIMEOUT).text

    def get_city_list(self):
        """Download the city8.com home page and fill ``self.city_list``."""
        self.city_list = self._parse_city_list(self._fetch('http://www.city8.com/'))
        print('Got city list!')

    def get_alphabet(self):
        """Read the road-index letters from a reference page into ``self.alphabets``."""
        html = self._fetch('http://gz.city8.com/road/A/')
        self.alphabets = self._xpath_texts(html, self._ALPHABET_XPATH)
        print('Got alphabets!')

    def get_city_roads(self):
        """Crawl every city's per-letter road pages and insert the roads.

        Each road is stored as ``{'city': <name>, 'road': <stripped text>}``.
        Cities whose ``/road`` page lacks the ``/road/a/`` marker are
        appended to ``self.na_city`` and reported at the end.
        """
        # Make the method usable on its own instead of failing on None.
        if self.city_list is None:
            self.get_city_list()
        if self.alphabets is None:
            self.get_alphabet()

        for city, href in self.city_list.items():
            if '/road/a/' not in self._fetch(href + '/road'):
                print('There is no road data of city: {}'.format(city))
                self.na_city.append(city)
                continue

            print('Crawling road data of city: {}'.format(city))

            for alpha in self.alphabets:
                page = self._fetch(href + '/road/' + alpha)
                roads = self._xpath_texts(page, self._ROADS_XPATH)
                if not roads:
                    print('City: {} alphabet: {} got no data'.format(city, alpha))
                    continue

                # One bulk write per page instead of one round trip per road.
                self.col.insert_many(
                    [{'city': city, 'road': rd.strip()} for rd in roads]
                )
                print('Successfully crawled city: {}, alphabet: {}'.format(city, alpha))

        print('These are cities with no road data: {} \n '
              'You might want to crawl road data of these cities from elsewhere.'.format(self.na_city))
57+
58+
59+
if __name__ == '__main__':
    # Script entry point: run the three crawl stages in their required order
    # (city list, then index letters, then the per-city road pages).
    crawler = Roads()
    for stage in (crawler.get_city_list, crawler.get_alphabet, crawler.get_city_roads):
        stage()

0 commit comments

Comments
 (0)