Skip to content

Commit 479cc1a

Browse files
committed
add Chapter11
1 parent 4b83e9f commit 479cc1a

14 files changed

Lines changed: 184 additions & 0 deletions

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from PIL import Image, ImageFilter
2+
3+
# Open the source photo and run it through Pillow's Gaussian blur filter in
# one chained expression, then write the result to disk and display it.
blurred_image = Image.open("kitten.jpg").filter(ImageFilter.GaussianBlur)
blurred_image.save("kitten_blurred.jpg")
blurred_image.show()
3.17 KB
Loading
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
r S MAS
2+
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
'''
2+
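Using the Pillow library, we can build a threshold filter that strips out the gradient background colour and keeps only the text, which makes the image much clearer.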
3+
'''
4+
5+
from PIL import Image
6+
import subprocess
7+
8+
def cleanFile(filePath, newFilePath, threshold=143, outputBase="test"):
    """Threshold an image to pure black/white, OCR it with Tesseract, print the text.

    Args:
        filePath: Path of the source image to clean.
        newFilePath: Path the thresholded image is written to; this is the
            file handed to Tesseract.
        threshold: Pixel values below this become 0, all others become 255.
            Defaults to 143, the value that used to be hard-coded here.
        outputBase: Output base name passed to Tesseract. The recognised text
            is read back from ``outputBase + ".txt"``. Defaults to "test".

    Returns:
        None. The recognised text is printed to stdout.
    """
    # The context manager releases the source file handle even if the
    # thresholding or the save fails.
    with Image.open(filePath) as image:
        #Set a threshold value for the image, and save
        cleaned = image.point(lambda x: 0 if x < threshold else 255)
        cleaned.save(newFilePath)

    #Call tesseract in a subprocess. Tesseract's biggest weakness is its
    #handling of gradient backgrounds, hence the thresholding step above.
    subprocess.call(["tesseract", newFilePath, outputBase])

    #Open and read the resulting data file
    # Tesseract writes its text output as UTF-8, so decode it explicitly
    # instead of relying on the platform's default encoding.
    with open(outputBase + ".txt", "r", encoding="utf-8") as outputFile:
        print(outputFile.read())
23+
24+
# Run the clean-up + OCR pipeline on the sample image.
cleanFile(filePath="text.png", newFilePath="text_clean.png")
18.7 KB
Loading
7.82 KB
Loading
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
This IS some text. wntten In Anal, that will be ,
2+
Tesseracl Here are some symbols: IWM .
3+
68.3 KB
Loading
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
WEI‘ nrrd Peace
2+
Len Nlkelayevldu Iolfluy
3+
4+
Readmg shmdd be ax
5+
wlnvame asnossxble Wenfler
6+
an mm m our cram: Llhvary
7+
8+
— Leo Tmsloy was a Russian rwovelwst
9+
I and moval phflmopher med lur
10+
A ms Ideas 01 nonviolenx reswslance m 5 We range 0, “and”
11+
12+
M.
13+
known for ms genevosxly to the '°'”"“‘ ‘”' "*°’“‘* W‘"'
14+
15+
reading dwnmuie,—a|\
16+
‘L °““"“‘ opnmizedfarreadabIh(y—so
17+
readerscan tho the fnrmat
18+
19+
ms best knawn nuvckave “War and
20+
21+
Peace" (1869),w>v(h msmy regarded as an em ‘”“”L"‘5"‘°"‘ W‘ 599 ""19
22+
vamer man a novel, and "Anna Kavemna" mm) “” '“""
23+
24+
Nxswark was admued m hiswne by Doxlayevxky,
25+
26+
Chxkoxa Turgenev, and Flauben and Water by Tm EaSyREad super
27+
28+
wgmiz wow and )ame§Joy<e Large 24 Eamon ‘S
29+
"we: and Peace" 2 bnlhant prose epm by Tolstoy °'.“'m'Zed for readers
30+
He pmsemsthe mm; M1892 axe mom msaae. "‘{"h ‘eVe'9‘Y "Ed“<9d
31+
when m: Russwanswon agamsune Napo\zcn>c vlsxon
32+
onshuqhtlhrmlghlhevadherenzr(owrme He
33+
tamures the essenze af Ne wwlh an R5 mancnges,
34+
hardsmps andmys The maepm poV\raya\ at
35+
numemus thataclerx lrom an xpheves m We wands
36+
u unparalleled reahsm Aime dassm‘
37+
38+
39+
40+
su+u27o3ro2ar1
41+
42+
||I||| H
43+
44+
cuvmumeu .21mna
45+
46+
47+
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import time
2+
from urllib.request import urlretrieve
3+
import subprocess
4+
from selenium import webdriver
5+
6+
#driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
7+
# Locator constants for find_element()/find_elements(); the find_element_by_*
# helpers used previously were removed in Selenium 4.
from selenium.webdriver.common.by import By

#The easiest way to get exactly one of every page
imageList = set()

driver = webdriver.Firefox()
try:
    driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
    time.sleep(2)

    # Clicking the cover image opens the book preview reader.
    driver.find_element(By.ID, "img-canvas").click()

    #Wait for the page to load
    time.sleep(10)
    print(driver.find_element(By.ID, "sitbReaderRightPageTurner").get_attribute("style"))
    # The element is looked up again on every pass rather than cached, as the
    # original did, so each check reads the arrow's current style attribute.
    while "pointer" in driver.find_element(By.ID, "sitbReaderRightPageTurner").get_attribute("style"):
        #While we can click on the right arrow, move through the pages
        driver.find_element(By.ID, "sitbReaderRightPageTurner").click()
        time.sleep(2)
        #Get any new pages that have loaded (multiple pages can load at once)
        pages = driver.find_elements(By.XPATH, "//div[@class='pageImage']/div/img")
        # The set silently drops URLs that were already collected.
        imageList.update(page.get_attribute("src") for page in pages)
finally:
    # Always shut the browser down, even if a lookup or click above raised.
    driver.quit()
29+
30+
#Start processing the images we've collected URLs for with Tesseract
for image in sorted(imageList):
    # Every page is downloaded to the same file name; each iteration
    # overwrites the previous page's image and text output.
    urlretrieve(image, "page.jpg")
    # Tesseract's console output is not needed. Discarding it via DEVNULL
    # avoids the deadlock that Popen(stdout=PIPE, stderr=PIPE).wait() can hit
    # once the child fills an unread pipe buffer.
    subprocess.run(
        ["tesseract", "page.jpg", "page"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    # Close the result file on every iteration and decode it as UTF-8, the
    # encoding Tesseract uses for its text output.
    with open("page.txt", "r", encoding="utf-8") as f:
        print(f.read())

0 commit comments

Comments
 (0)