Skip to content

Commit 4a73d9b

Browse files
Merge pull request avinashkranjan#919 from iamakkkhil/master
avinashkranjan#913 Codeforces_Problem_Scraper Added
2 parents 056b6b8 + a086be2 commit 4a73d9b

3 files changed

Lines changed: 191 additions & 0 deletions

File tree

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
import os
2+
from selenium import webdriver # Automated webdriver
3+
from PIL import Image
4+
from fpdf import FPDF # For converting images to pdf
5+
6+
7+
def _read_int(prompt):
    """
    Keep prompting until the user types something that int() accepts.
    :param prompt: text shown to the user before every attempt
    :return: the integer entered by the user
    """
    while True:
        raw = input(prompt)
        try:
            return int(raw)
        except ValueError:
            print(f"'{raw.strip()}' is not a whole number, please try again.")


def select_difficulty():
    """
    This function lets the user choose the difficulty range of the problems.

    Both bounds are read interactively. The pair is asked for again until it
    forms a valid range, i.e. 800 <= Min <= Max <= 3500 (the limits that the
    prompt announces), so a typo no longer ends the program with a traceback.
    :return: difficulty_level[] holding [Min, Max]
    """
    lowest, highest = 800, 3500
    while True:
        print("\nEnter the Range of difficulty between 800 to 3500: ")
        # List displays evaluate left to right, so "Min" is always asked first
        difficulty_level = [_read_int("Min: "), _read_int("Max: ")]
        if lowest <= difficulty_level[0] <= difficulty_level[1] <= highest:
            return difficulty_level
        print(f"Invalid range: need {lowest} <= Min <= Max <= {highest}.")
18+
19+
20+
def _collect_hrefs(driver, css_selector):
    """
    Return the href of every element matching css_selector on the loaded page.
    :param driver: webdriver whose current page is inspected
    :param css_selector: CSS selector of the anchors to read
    :return: list with one href per matching element, in document order
    """
    # Local import: keeps the file's top-level import block untouched
    from selenium.webdriver.common.by import By

    return [node.get_attribute("href")
            for node in driver.find_elements(By.CSS_SELECTOR, css_selector)]


def extracting_problem_links(diff_level):
    """
    This function first saves the links of the result pages to scrape from
    and then collects the link of every question on them into a list.

    The number of questions wanted is asked on stdin (int() raises ValueError
    if the answer is not an integer). The Chrome driver is started from the
    module-level DRIVER_PATH and is always shut down before returning.
    :param diff_level: difficulty_level entered by the user, [Min, Max]
    :return pblms_links: at most the requested number of question links; fewer
                         when the listed pages do not contain that many
    """
    no_of_questions = int(input("\nHow many Questions you want to scrape: "))
    if no_of_questions <= 0:
        # Nothing was asked for, so there is no reason to start a browser
        print("\nNothing to scrape: the number of questions must be positive.")
        return []

    options = webdriver.ChromeOptions()
    options.headless = True
    # NOTE(review): passing the driver path positionally is the Selenium 3
    # calling convention - confirm it against the Selenium version in use.
    driver = webdriver.Chrome(DRIVER_PATH, options=options)
    try:
        print("\nRequesting URL ...")
        driver.get(f"https://codeforces.com/problemset/?tags={diff_level[0]}-{diff_level[1]}")

        # ===================Getting no. of Pages to Scrape=============================
        print("\nFinding available pages to scrape....")
        page_links = _collect_hrefs(driver, "div.pagination a")
        # The last pagination anchor is left out of the count and of the loop
        # below - presumably it is the "next page" arrow; verify on the site.
        print(f"Available Pages to scrape are: {len(page_links[:-1])}")

        # ===================Scraping the questions page by page========================
        pblms_links = []
        # Page 1 is already loaded (None = no navigation needed); the
        # remaining pages are reached through their pagination links.
        for page, link in enumerate([None] + page_links[1:-1], start=1):
            print(f"\nScraping Page {page}")
            if link is not None:
                driver.get(link)

            # "td.id.left a" is the selector the previous
            # `"td.id.dark.left a" and "td.id.left a"` expression evaluated to;
            # it also matches cells that additionally carry the "dark" class.
            still_needed = no_of_questions - len(pblms_links)
            pblms_links.extend(_collect_hrefs(driver, "td.id.left a")[:still_needed])
            print(f"URLs of Question Scraped till now: {len(pblms_links)}")

            # If we scraped required no. of questions then return
            if len(pblms_links) >= no_of_questions:
                print(f"\nURLs Scrapped Successfully {len(pblms_links)} out of {no_of_questions}")
                return pblms_links

        # scraped all the available questions but still the count is less
        print(f"\n{len(pblms_links)} out of {no_of_questions} URLs able to scrapped !!!")
        return pblms_links
    finally:
        # Always end the headless browser, even when scraping fails midway
        driver.quit()
97+
98+
99+
def _pdf_filename(title):
    """
    Build a file-system safe '<problem name>.pdf' from a problem page title.
    :param title: text of the page's title element, e.g. "A. Watermelon"
    :return: file name without the leading problem index
    """
    index, separator, name = title.partition(". ")
    # Drop the leading index ("A", "B2", ...) only when it looks like one;
    # a fixed title[3:] slice mangles indexes that are not one character long
    if not (separator and name and index.isalnum() and len(index) <= 3):
        name = title
    # Characters that are not allowed in file names on common file systems
    unsafe = str.maketrans({char: "_" for char in '\\/:*?"<>|'})
    return (name.strip().translate(unsafe) or "problem") + '.pdf'


def getproblem(URLs):
    """
    getproblem() : It opens every Codeforces problem URL it is given with selenium
    and the chrome webdriver, captures a screenshot of the element with the
    ttypography class (the problem statement) into image.png and then saves that
    image as a one-page pdf file by using the fpdf library.

    The pdf files are written to ./Coderforces_Problem_Scrapper/problems_pdf, which
    is created when missing. image.png is left in the working directory for the
    caller to remove. The Chrome driver is started from the module-level DRIVER_PATH
    and is always shut down before returning.
    :param URLs: list of problem page links to save
    :return: None
    """
    # Local import: keeps the file's top-level import block untouched
    from selenium.webdriver.common.by import By

    path = 'image.png'

    # Creating a Target Output Folder
    target_folder = './Coderforces_Problem_Scrapper/problems_pdf'
    os.makedirs(target_folder, exist_ok=True)

    if not URLs:
        # Nothing to save, so there is no reason to start a browser
        return

    options = webdriver.ChromeOptions()
    # Headless = True for taking a scrolling snapshot
    options.headless = True
    # NOTE(review): passing the driver path positionally is the Selenium 3
    # calling convention - confirm it against the Selenium version in use.
    driver = webdriver.Chrome(DRIVER_PATH, options=options)
    try:
        for file_counter, url in enumerate(URLs, start=1):
            driver.get(url)
            # Make the window as tall as the whole document so that the
            # element screenshot below is not cut off
            required_height = driver.execute_script(
                'return document.body.parentNode.scrollHeight')
            driver.set_window_size(1366, required_height)

            title = driver.find_element(By.CLASS_NAME, "title").text
            filename = _pdf_filename(title)

            # Taking SS of everything within the ttypography class
            driver.find_element(By.CLASS_NAME, 'ttypography').screenshot(path)

            # Opening image with pillow only to read its width and height;
            # the context manager closes the file handle right away
            with Image.open(path) as cover:
                WIDTH, HEIGHT = cover.size
            MARGIN = 10
            # based on image's height and width we are adjusting the pdf margin and borders
            pdf = FPDF(unit='pt', format=[WIDTH + 2 * MARGIN, HEIGHT + 2 * MARGIN])
            pdf.add_page()  # Adding new page to the pdf
            pdf.image(path, MARGIN, MARGIN)

            destination = os.path.join(target_folder, filename)
            pdf.output(destination, "F")  # saving the pdf with the specified filename
            # Report the real location (the old message named ./problems_pdf/)
            print(f'File saved in your directory {destination} ({file_counter}/{len(URLs)}) !')
    finally:
        # Always end the headless browser, even when a page fails
        driver.quit()
146+
147+
148+
if __name__ == "__main__":
    # DRIVER_PATH is read as a module-level global by the scraping functions
    DRIVER_PATH = input("Enter DRIVER PATH location: ")
    diff = select_difficulty()  # Accepting difficulty level from user
    problems_link = extracting_problem_links(diff)  # scraping the required no. of links
    try:
        getproblem(problems_link)  # saving the Questions in PDF file.
    finally:
        # The temporary screenshot only exists when at least one problem was
        # captured; remove it even if saving failed part-way through
        if os.path.exists('image.png'):
            os.remove('image.png')
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Save any number of Problem Statements you like from Codeforces as PDFs.
2+
3+
This Python script lets you download any number of problem statements from Codeforces and save each of them as a PDF file. The script uses Selenium WebDriver, Pillow and the fpdf library. Selenium is used with the Chrome WebDriver, so the Chrome browser and a matching ChromeDriver executable are required.
4+
5+
## Setting up:
6+
7+
- Create a virtual environment and activate it.
8+
9+
- Install the requirements
10+
11+
```sh
12+
$ pip install -r requirements.txt
13+
```
14+
15+
## Running the script:
16+
17+
```sh
18+
$ python Codeforces_Problem_Scrapper.py
19+
```
20+
21+
## Terminal Screenshot:
22+
23+
![Imgur](https://i.imgur.com/gqHMxMz.png)
24+
25+
The program will ask you to enter:
26+
1. DRIVER PATH: the location of the ChromeDriver executable on your machine.
27+
2. A valid difficulty range for the problems (Min and Max, between 800 and 3500).
28+
3. Number of Questions to Scrape.
29+
30+
## PDF Output:
31+
![Imgur](https://i.imgur.com/1iMC7PE.png)
32+
![GIF](https://media.giphy.com/media/lQ95K1IzUGB2tiqlmZ/giphy.gif)
33+
34+
## Author
35+
[ Akhil Bhalerao ](https://github.com/iamakkkhil)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
pillow
2+
fpdf
3+
selenium

0 commit comments

Comments
 (0)