|
| 1 | +import re |
| 2 | +import requests |
| 3 | +from lxml import etree |
| 4 | +from pymongo import MongoClient |
| 5 | + |
| 6 | + |
class Roads(object):
    """Crawl per-city road names from city8.com into a MongoDB collection.

    Intended call order: get_city_list(), get_alphabet(), get_city_roads().
    get_city_roads() runs the first two itself when they have not been
    called yet.
    """

    # Seconds to wait for each HTTP response. requests applies no timeout
    # unless one is passed, so without this a single stalled connection
    # would block the whole crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Mapping of link text (the city name shown on the index page) to the
        # link's href without its trailing slash; filled by get_city_list().
        self.city_list = None
        # Index labels scraped by get_alphabet(); each one is appended to
        # '<city href>/road/' to form a page URL. Presumably the letters A-Z.
        self.alphabets = None
        # Cities whose '/road' page did not advertise any road index.
        self.na_city = []
        client = MongoClient('127.0.0.1', 27017)
        self.col = client.get_database('Country_Road').get_collection('city_roads')

    def _fetch(self, url):
        """Return the response of a GET to *url*, bounded by REQUEST_TIMEOUT.

        Errors raised by requests (including a timeout) propagate to the
        caller.
        """
        return requests.get(url, timeout=self.REQUEST_TIMEOUT)

    @staticmethod
    def _parse_city_list(html):
        """Return a {link text: href} dict extracted from the index page HTML."""
        pat1 = r"city8.com/'>(.*?)</a></li>"
        pat2 = r"<a target='_blank' href='(.*?)/'>"
        city = re.findall(string=html, pattern=pat1)
        href = re.findall(string=html, pattern=pat2)
        # zip() pairs the two result lists by position and stops at the
        # shorter one.
        # NOTE(review): the patterns are matched independently, so a link that
        # matches only one of them would misalign every later pair -- confirm
        # against the live page markup.
        return dict(zip(city, href))

    @staticmethod
    def _road_documents(city, roads):
        """Return one {'city', 'road'} document per entry of *roads*.

        Each road name has its surrounding whitespace stripped.
        """
        return [{'city': city, 'road': rd.strip()} for rd in roads]

    def get_city_list(self):
        """Download the city8.com front page and store its city links.

        Sets self.city_list; see _parse_city_list() for the mapping's shape.
        """
        res = self._fetch('http://www.city8.com/')
        self.city_list = self._parse_city_list(res.text)
        print('Got city list!')

    def get_alphabet(self):
        """Scrape the road-index labels from one reference page.

        Sets self.alphabets to the text of the anchors found at a fixed XPath
        on http://gz.city8.com/road/A/.
        """
        res = self._fetch('http://gz.city8.com/road/A/')
        parsed = etree.HTML(res.text)
        self.alphabets = parsed.xpath("/html/body/div/div[2]/div[3]/div[1]/div[1]/div[1]/a/text()")
        print('Got alphabets!')

    def get_city_roads(self):
        """Crawl every city's road pages and insert the roads into MongoDB.

        For each city the '<href>/road' page is fetched first; if its text
        does not contain '/road/a/' the city is recorded in self.na_city and
        skipped. Otherwise one page per entry of self.alphabets is fetched and
        every road name found on it is stored as a {'city', 'road'} document.
        A summary of the skipped cities is printed at the end.
        """
        # Load the prerequisites on demand so that calling this method first
        # works instead of failing on a None attribute.
        if self.city_list is None:
            self.get_city_list()
        if self.alphabets is None:
            self.get_alphabet()

        for city, href in self.city_list.items():
            res_test = self._fetch(href + '/road')

            if '/road/a/' not in res_test.text:
                print('There is no road data of city: {}'.format(city))
                self.na_city.append(city)
                continue

            print('Crawling road data of city: {}'.format(city))

            for alpha in self.alphabets:
                res_road = self._fetch(href + '/road/' + alpha)
                parsed = etree.HTML(res_road.text)
                roads = parsed.xpath('/html/body/div/div[2]/div[3]/div[1]/div[2]/a/text()')
                if roads:
                    # One bulk write per page instead of one round-trip per
                    # road; insert_many() must not be given an empty list,
                    # which the truthiness check above guarantees.
                    self.col.insert_many(self._road_documents(city, roads))
                    print('Successfully crawled city: {}, alphabet: {}'.format(city, alpha))
                else:
                    print('City: {} alphabet: {} got no data'.format(city, alpha))

        print('These are cities with no road data: {} \n '
              'You might want to crawl road data of these cities from elsewhere.'.format(self.na_city))
| 57 | + |
| 58 | + |
if __name__ == '__main__':
    # Run the crawl stages in order: collect the city links, collect the
    # road-index labels, then walk every city's road pages.
    crawler = Roads()
    crawler.get_city_list()
    crawler.get_alphabet()
    crawler.get_city_roads()
0 commit comments