Skip to content

Commit ae37deb

Browse files
committed
优化爬虫基础挑战
1 parent 4d2edf9 commit ae37deb

5 files changed

Lines changed: 7 additions & 6 deletions

File tree

10-crawl-github-user-repositories/githubspider.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ def start_urls(self):
12 12
def parse(self, response):
13 13
for repository in response.css('li.public'):
14 14
yield {
15-
'name': repository.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first("\n\s*(.*)"),
15+
'name': repository.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first(r'\n\s*(.*)'),
16 16
'update_time': repository.xpath('.//relative-time/@datetime').extract_first()
17 17
}

11-save-github-repositories-to-mysql/shiyanlou/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
9 9

10 10
class Repository(Base):
11 11
__tablename__ = 'repositories'
12+
12 13
id = Column(Integer, primary_key=True)
13 14
name = Column(String(64))
14 15
update_time = Column(DateTime)

11-save-github-repositories-to-mysql/shiyanlou/spiders/github.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def start_urls(self):
14 14
def parse(self, response):
15 15
for repository in response.css('li.public'):
16 16
item = GithubItem({
17-
'name': repository.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first("\n\s*(.*)"),
17+
'name': repository.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first(r'\n\s*(.*)'),
18 18
'update_time': repository.xpath('.//relative-time/@datetime').extract_first()
19 19
})
20 20
yield item

12-crawl-github-commits-branches-releases/shiyanlou/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
9 9

10 10
class Repository(Base):
11 11
__tablename__ = 'repositories'
12+
12 13
id = Column(Integer, primary_key=True)
13 14
name = Column(String(64))
14 15
update_time = Column(DateTime)

12-crawl-github-commits-branches-releases/shiyanlou/spiders/github.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def parse(self, response):
15 15
for repository in response.css('li.public'):
16 16
item = GithubItem()
17 17
item['name'] = repository.xpath(
18-
'.//a[@itemprop="name codeRepository"]/text()').re_first("\n\s*(.*)")
18+
'.//a[@itemprop="name codeRepository"]/text()').re_first(r'\n\s*(.*)')
19 19
item['update_time'] = repository.xpath(
20 20
'.//relative-time/@datetime').extract_first()
21 21
repo_url = response.urljoin(
@@ -25,7 +25,6 @@ def parse(self, response):
25 25
yield request
26 26

27 27
# 如果 Next 按钮没被禁用,那么表示有下一页
28-
# Scrapy 不支持 CSS :last-child 选择器
29 28
spans = response.css('div.pagination span.disabled::text')
30 29
if len(spans) == 0 or spans[-1].extract() != 'Next':
31 30
next_url = response.css(
@@ -35,9 +34,9 @@ def parse(self, response):
35 34
def parse_repo(self, response):
36 35
item = response.meta['item']
37 36
for number in response.css('ul.numbers-summary li'):
38-
type_text = number.xpath('.//a/text()').re_first('\n\s*(.*)\n')
37+
type_text = number.xpath('.//a/text()').re_first(r'\n\s*(.*)\n')
39 38
number_text = number.xpath(
40-
'.//span[@class="num text-emphasized"]/text()').re_first('\n\s*(.*)\n')
39+
'.//span[@class="num text-emphasized"]/text()').re_first(r'\n\s*(.*)\n')
41 40
if type_text and number_text:
42 41
number_text = number_text.replace(',', '')
43 42
if type_text in ('commit', 'commits'):

0 commit comments

Comments
 (0)