# quote_scraper.py
# Forked from Python-World/python-mini-projects.
from bs4 import BeautifulSoup
import requests
import csv
# Scrapes every quote listed on quotes.toscrape.com, following the "next"
# pagination link until there is none, and writes one CSV row per quote
# (quote text, author, tags) to quote_list.csv in the working directory.

# URL to the website
url = 'http://quotes.toscrape.com'
# Seconds to wait for the server before a request is abandoned; without a
# timeout, requests can block forever on an unresponsive host.
request_timeout = 10
fieldnames = ['quote', 'author', 'tags']


def _fetch_page(page_url):
    """Download ``page_url`` and return it parsed with html.parser.

    Raises requests.RequestException on connection problems, timeouts,
    or a non-2xx HTTP status.
    """
    response = requests.get(page_url, timeout=request_timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it
    # were a page of quotes.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


try:
    # Fetch the first page before touching the output file so that a
    # network failure does not truncate an existing quote_list.csv.
    bs = _fetch_page(url)

    # newline='' lets the csv module control line endings (prevents blank
    # rows on Windows); utf-8 because the quotes contain non-ASCII
    # characters. The with-block closes the file even when an error occurs.
    with open('quote_list.csv', 'w', newline='', encoding='utf-8') as csv_file:
        dictwriter = csv.DictWriter(csv_file, fieldnames=fieldnames)
        # Writes the headers
        dictwriter.writeheader()

        # Runs until a page without a "next" button is reached
        while True:
            # Loops through every quote on the current page
            for quote in bs.find_all('div', {'class': 'quote'}):
                # Extract the text part of quote, author and tags
                text = quote.find('span', {'class': 'text'}).text
                author = quote.find('small', {'class': 'author'}).text
                tags = [tag.text for tag in quote.find_all('a', {'class': 'tag'})]
                # The tags list is written as its Python repr, which keeps
                # the output format of the original script unchanged.
                dictwriter.writerow({'quote': text, 'author': author, 'tags': tags})

            # Finds the link to the next page
            next_link = bs.find('li', {'class': 'next'})
            if not next_link:
                break
            # The href is site-relative (starts with '/'), so it is
            # appended to the base URL.
            bs = _fetch_page(url + next_link.a.attrs['href'])
# RequestException derives from OSError, so it must be handled first.
except requests.RequestException as err:
    print('Network error while fetching quotes: {}'.format(err))
except OSError as err:
    print('Could not write quote_list.csv: {}'.format(err))
except (AttributeError, KeyError) as err:
    # A quote or pagination element was missing an expected tag/attribute.
    print('Unexpected page structure: {}'.format(err))