优化爬虫基础挑战

jaggerwang · jaggerwang · commit ae37deb0a0b0 · 2018-11-13T10:34:52.000+08:00
diff --git a/10-crawl-github-user-repositories/githubspider.py b/10-crawl-github-user-repositories/githubspider.py
@@ -12,6 +12,6 @@ def start_urls(self):
     def parse(self, response):
         for repository in response.css('li.public'):
             yield {
-                'name': repository.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first("\n\s*(.*)"),
+                'name': repository.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first(r'\n\s*(.*)'),
                 'update_time': repository.xpath('.//relative-time/@datetime').extract_first()
             }
diff --git a/11-save-github-repositories-to-mysql/shiyanlou/models.py b/11-save-github-repositories-to-mysql/shiyanlou/models.py
@@ -9,6 +9,7 @@
 
 class Repository(Base):
     __tablename__ = 'repositories'
+
     id = Column(Integer, primary_key=True)
     name = Column(String(64))
     update_time = Column(DateTime)
diff --git a/11-save-github-repositories-to-mysql/shiyanlou/spiders/github.py b/11-save-github-repositories-to-mysql/shiyanlou/spiders/github.py
@@ -14,7 +14,7 @@ def start_urls(self):
     def parse(self, response):
         for repository in response.css('li.public'):
             item = GithubItem({
-                'name': repository.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first("\n\s*(.*)"),
+                'name': repository.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first(r'\n\s*(.*)'),
                 'update_time': repository.xpath('.//relative-time/@datetime').extract_first()
             })
             yield item
diff --git a/12-crawl-github-commits-branches-releases/shiyanlou/models.py b/12-crawl-github-commits-branches-releases/shiyanlou/models.py
@@ -9,6 +9,7 @@
 
 class Repository(Base):
     __tablename__ = 'repositories'
+
     id = Column(Integer, primary_key=True)
     name = Column(String(64))
     update_time = Column(DateTime)
diff --git a/12-crawl-github-commits-branches-releases/shiyanlou/spiders/github.py b/12-crawl-github-commits-branches-releases/shiyanlou/spiders/github.py
@@ -15,7 +15,7 @@ def parse(self, response):
         for repository in response.css('li.public'):
             item = GithubItem()
             item['name'] = repository.xpath(
-                './/a[@itemprop="name codeRepository"]/text()').re_first("\n\s*(.*)")
+                './/a[@itemprop="name codeRepository"]/text()').re_first(r'\n\s*(.*)')
             item['update_time'] = repository.xpath(
                 './/relative-time/@datetime').extract_first()
             repo_url = response.urljoin(
@@ -25,7 +25,6 @@ def parse(self, response):
             yield request
 
         # 如果 Next 按钮没被禁用，那么表示有下一页
-        # Scrapy 不支持 CSS :last-child 选择器
         spans = response.css('div.pagination span.disabled::text')
         if len(spans) == 0 or spans[-1].extract() != 'Next':
             next_url = response.css(
@@ -35,9 +34,9 @@ def parse(self, response):
     def parse_repo(self, response):
         item = response.meta['item']
         for number in response.css('ul.numbers-summary li'):
-            type_text = number.xpath('.//a/text()').re_first('\n\s*(.*)\n')
+            type_text = number.xpath('.//a/text()').re_first(r'\n\s*(.*)\n')
             number_text = number.xpath(
-                './/span[@class="num text-emphasized"]/text()').re_first('\n\s*(.*)\n')
+                './/span[@class="num text-emphasized"]/text()').re_first(r'\n\s*(.*)\n')
             if type_text and number_text:
                 number_text = number_text.replace(',', '')
                 if type_text in ('commit', 'commits'):

Original file line number	Diff line number	Diff line change
`@@ -12,6 +12,6 @@ def start_urls(self):`
`12`	`12`	`def parse(self, response):`
`13`	`13`	`for repository in response.css('li.public'):`
`14`	`14`	`yield {`
`15`		`- 'name': repository.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first("\n\s*(.*)"),`
	`15`	`+ 'name': repository.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first(r'\n\s*(.*)'),`
`16`	`16`	`'update_time': repository.xpath('.//relative-time/@datetime').extract_first()`
`17`	`17`	`}`