Skip to content

Commit 1f2bdb1

Browse files
committed
change bs4 parser to lxml
1 parent 0688d1c commit 1f2bdb1

1 file changed

Lines changed: 18 additions & 18 deletions

File tree

zhihu.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def __init__(self, url, title=None):
107107

108108
def parser(self):
109109
r = requests.get(self.url)
110-
self.soup = BeautifulSoup(r.content)
110+
self.soup = BeautifulSoup(r.content, "lxml")
111111

112112
def get_title(self):
113113
if hasattr(self, "title"):
@@ -182,7 +182,7 @@ def get_all_answers(self):
182182
for j in xrange(min(answers_num, 20)):
183183
if self.soup == None:
184184
self.parser()
185-
soup = BeautifulSoup(self.soup.encode("utf-8"))
185+
soup = BeautifulSoup(self.soup.encode("utf-8"), "lxml")
186186

187187
is_my_answer = False
188188
if soup.find_all("div", class_="zm-item-answer")[j].find("span", class_="count") == None:
@@ -251,9 +251,9 @@ def get_all_answers(self):
251251

252252
answer_list = r.json()["msg"]
253253
for j in xrange(min(answers_num - i * 20, 20)):
254-
soup = BeautifulSoup(self.soup.encode("utf-8"))
254+
soup = BeautifulSoup(self.soup.encode("utf-8"), "lxml")
255255

256-
answer_soup = BeautifulSoup(answer_list[j])
256+
answer_soup = BeautifulSoup(answer_list[j], "lxml")
257257

258258
if answer_soup.find("div", class_="zm-editable-content clearfix") == None:
259259
continue
@@ -337,7 +337,7 @@ def __init__(self, user_url, user_id=None):
337337

338338
def parser(self):
339339
r = requests.get(self.user_url)
340-
soup = BeautifulSoup(r.content)
340+
soup = BeautifulSoup(r.content, "lxml")
341341
self.soup = soup
342342

343343
def get_user_id(self):
@@ -496,7 +496,7 @@ def get_followees(self):
496496
followee_url = self.user_url + "/followees"
497497
r = requests.get(followee_url)
498498

499-
soup = BeautifulSoup(r.content)
499+
soup = BeautifulSoup(r.content, "lxml")
500500
for i in xrange((followees_num - 1) / 20 + 1):
501501
if i == 0:
502502
user_url_list = soup.find_all("h2", class_="zm-list-content-title")
@@ -523,7 +523,7 @@ def get_followees(self):
523523

524524
followee_list = r_post.json()["msg"]
525525
for j in xrange(min(followees_num - i * 20, 20)):
526-
followee_soup = BeautifulSoup(followee_list[j])
526+
followee_soup = BeautifulSoup(followee_list[j], "lxml")
527527
user_link = followee_soup.find("h2", class_="zm-list-content-title").a
528528
yield User(user_link["href"], user_link.string.encode("utf-8"))
529529

@@ -541,7 +541,7 @@ def get_followers(self):
541541
follower_url = self.user_url + "/followers"
542542
r = requests.get(follower_url)
543543

544-
soup = BeautifulSoup(r.content)
544+
soup = BeautifulSoup(r.content, "lxml")
545545
for i in xrange((followers_num - 1) / 20 + 1):
546546
if i == 0:
547547
user_url_list = soup.find_all("h2", class_="zm-list-content-title")
@@ -567,7 +567,7 @@ def get_followers(self):
567567

568568
follower_list = r_post.json()["msg"]
569569
for j in xrange(min(followers_num - i * 20, 20)):
570-
follower_soup = BeautifulSoup(follower_list[j])
570+
follower_soup = BeautifulSoup(follower_list[j], "lxml")
571571
user_link = follower_soup.find("h2", class_="zm-list-content-title").a
572572
yield User(user_link["href"], user_link.string.encode("utf-8"))
573573

@@ -591,7 +591,7 @@ def get_asks(self):
591591
ask_url = self.user_url + "/asks?page=" + str(i + 1)
592592
r = requests.get(ask_url)
593593

594-
soup = BeautifulSoup(r.content)
594+
soup = BeautifulSoup(r.content, "lxml")
595595
for question in soup.find_all("a", class_="question_link"):
596596
url = "http://www.zhihu.com" + question["href"]
597597
title = question.string.encode("utf-8")
@@ -611,7 +611,7 @@ def get_answers(self):
611611
for i in xrange((answers_num - 1) / 20 + 1):
612612
answer_url = self.user_url + "/answers?page=" + str(i + 1)
613613
r = requests.get(answer_url)
614-
soup = BeautifulSoup(r.content)
614+
soup = BeautifulSoup(r.content, "lxml")
615615
for answer in soup.find_all("a", class_="question_link"):
616616
question_url = "http://www.zhihu.com" + answer["href"][0:18]
617617
question_title = answer.string.encode("utf-8")
@@ -634,7 +634,7 @@ def get_collections(self):
634634

635635
r = requests.get(collection_url)
636636

637-
soup = BeautifulSoup(r.content)
637+
soup = BeautifulSoup(r.content, "lxml")
638638
for collection in soup.find_all("div", class_="zm-profile-section-item zg-clear"):
639639
url = "http://www.zhihu.com" + \
640640
collection.find("a", class_="zm-profile-fav-item-title")["href"]
@@ -650,7 +650,7 @@ def get_likes(self):
650650
yield
651651
else:
652652
r = requests.get(self.user_url)
653-
soup = BeautifulSoup(r.content)
653+
soup = BeautifulSoup(r.content, "lxml")
654654
# Handle the first liked item
655655
first_item = soup.find("div", attrs={'class':'zm-profile-section-item zm-item clearfix'})
656656
first_item = first_item.find("div", attrs={'class':'zm-profile-section-main zm-profile-section-activity-main zm-profile-activity-page-item-main'})
@@ -717,7 +717,7 @@ def __init__(self, answer_url, question=None, author=None, upvote=None, content=
717717

718718
def parser(self):
719719
r = requests.get(self.answer_url)
720-
soup = BeautifulSoup(r.content)
720+
soup = BeautifulSoup(r.content, "lxml")
721721
self.soup = soup
722722

723723
def get_question(self):
@@ -772,7 +772,7 @@ def get_content(self):
772772
else:
773773
if self.soup == None:
774774
self.parser()
775-
soup = BeautifulSoup(self.soup.encode("utf-8"))
775+
soup = BeautifulSoup(self.soup.encode("utf-8"), "lxml")
776776
answer = soup.find("div", class_="zm-editable-content clearfix")
777777
soup.body.extract()
778778
soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
@@ -962,7 +962,7 @@ def get_voters(self):
962962
# s = session
963963
# r = s.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)})
964964
r = requests.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)})
965-
soup = BeautifulSoup(r.content)
965+
soup = BeautifulSoup(r.content, "lxml")
966966
voters_info = soup.find_all("span")[1:-1]
967967
if len(voters_info) == 0:
968968
return
@@ -997,7 +997,7 @@ def __init__(self, url, name=None, creator=None):
997997
self.creator = creator
998998
def parser(self):
999999
r = requests.get(self.url)
1000-
soup = BeautifulSoup(r.content)
1000+
soup = BeautifulSoup(r.content, "lxml")
10011001
self.soup = soup
10021002

10031003
def get_name(self):
@@ -1062,7 +1062,7 @@ def get_all_answers(self):
10621062
i = 2
10631063
while True:
10641064
r = requests.get(self.url + "?page=" + str(i))
1065-
answer_soup = BeautifulSoup(r.content)
1065+
answer_soup = BeautifulSoup(r.content, "lxml")
10661066
answer_list = answer_soup.find_all("div", class_="zm-item")
10671067
if len(answer_list) == 0:
10681068
break

0 commit comments

Comments (0)