lambda70v
diff --git a/‎Web Scraping with Python/Chapter11/basic-image.py‎
Lines changed: 6 additions & 0 deletions b/‎Web Scraping with Python/Chapter11/basic-image.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎Web Scraping with Python/Chapter11/captcha.jpg‎
3.17 KB b/‎Web Scraping with Python/Chapter11/captcha.jpg‎
3.17 KB
diff --git a/‎Web Scraping with Python/Chapter11/captcha.txt‎
Lines changed: 2 additions & 0 deletions b/‎Web Scraping with Python/Chapter11/captcha.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Web Scraping with Python/Chapter11/clean-image.py‎
Lines changed: 24 additions & 0 deletions b/‎Web Scraping with Python/Chapter11/clean-image.py‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎Web Scraping with Python/Chapter11/kitten.jpg‎
18.7 KB b/‎Web Scraping with Python/Chapter11/kitten.jpg‎
18.7 KB
diff --git a/‎Web Scraping with Python/Chapter11/kitten_blurred.jpg‎
7.82 KB b/‎Web Scraping with Python/Chapter11/kitten_blurred.jpg‎
7.82 KB
diff --git a/‎Web Scraping with Python/Chapter11/output.txt‎
Lines changed: 3 additions & 0 deletions b/‎Web Scraping with Python/Chapter11/output.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎Web Scraping with Python/Chapter11/page.jpg‎
68.3 KB b/‎Web Scraping with Python/Chapter11/page.jpg‎
68.3 KB
diff --git a/‎Web Scraping with Python/Chapter11/page.txt‎
Lines changed: 47 additions & 0 deletions b/‎Web Scraping with Python/Chapter11/page.txt‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎Web Scraping with Python/Chapter11/read-web-images.py‎
Lines changed: 36 additions & 0 deletions b/‎Web Scraping with Python/Chapter11/read-web-images.py‎
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,6 @@
+from PIL import Image, ImageFilter
+
+kitten = Image.open("kitten.jpg")
+blurryKitten = kitten.filter(ImageFilter.GaussianBlur)
+blurryKitten.save("kitten_blurred.jpg")
+blurryKitten.show()
@@ -0,0 +1,2 @@
+r S MAS
+
@@ -0,0 +1,24 @@
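+'''
+Using the Pillow library, we can create a threshold filter that removes the gradient background color and leaves only the text, making the image clearer.
+'''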
+
+from PIL import Image
+import subprocess
+
+def cleanFile(filePath, newFilePath):
+    image = Image.open(filePath)
+
+    #Set a threshold value for the image, and save
+    image = image.point(lambda x: 0 if x<143 else 255)
+    image.save(newFilePath)
+
+
+    #子进程调用tesseract  Tesseract 最大的缺点是对渐变背景色的处理
+    subprocess.call(["tesseract", newFilePath, "test"])
+    
+    #Open and read the resulting data file
+    outputFile = open("test.txt", 'r')
+    print(outputFile.read())
+    outputFile.close()
+
+cleanFile("text.png", "text_clean.png")
@@ -0,0 +1,3 @@
+This IS some text. wntten In Anal, that will be ,
+Tesseracl Here are some symbols: IWM .
+
@@ -0,0 +1,47 @@
+WEI‘ nrrd Peace
+Len Nlkelayevldu Iolﬂuy
+
+Readmg shmdd be ax
+wlnvame asnossxble Wenﬂer
+an mm m our cram: Llhvary
+
+— Leo Tmsloy was a Russian rwovelwst
+I and moval phﬂmopher med lur
+A ms Ideas 01 nonviolenx reswslance m 5 We range 0, “and”
+
+M.   
+known for ms genevosxly to the '°'”"“‘ ‘”' "*°’“‘* W‘"'
+
+reading dwnmuie,—a|\
+‘L  °““"“‘ opnmizedfarreadabIh(y—so
+readerscan tho the fnrmat
+
+ms best knawn nuvckave “War and
+
+Peace" (1869),w>v(h msmy regarded as an em ‘”“”L"‘5"‘°"‘ W‘ 599 ""19
+vamer man a novel, and "Anna Kavemna" mm) “” '“""
+
+Nxswark was admued m hiswne by Doxlayevxky,
+
+Chxkoxa Turgenev, and Flauben and Water by Tm EaSyREad super
+
+wgmiz wow and )ame§Joy<e Large 24 Eamon ‘S
+"we: and Peace" 2 bnlhant prose epm by Tolstoy °'.“'m'Zed for readers
+He pmsemsthe mm; M1892 axe mom msaae. "‘{"h ‘eVe'9‘Y "Ed“<9d
+when m: Russwanswon agamsune Napo\zcn>c vlsxon
+onshuqhtlhrmlghlhevadherenzr(owrme He
+tamures the essenze af Ne wwlh an R5 mancnges,
+hardsmps andmys The maepm poV\raya\ at
+numemus thataclerx lrom an xpheves m We wands
+u unparalleled reahsm Aime dassm‘
+
+ 
+
+su+u27o3ro2ar1
+
+||I||| H
+
+cuvmumeu .21mna
+
+ 
+
@@ -0,0 +1,36 @@
+import time
+from urllib.request import urlretrieve
+import subprocess
+from selenium import webdriver
+
+#driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
+driver = webdriver.Firefox()
+driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
+time.sleep(2)
+
+driver.find_element_by_id("img-canvas").click()
+#The easiest way to get exactly one of every page
+imageList = set()
+
+#Wait for the page to load
+time.sleep(10)
+print(driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"))
+while "pointer" in driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"):
+    #While we can click on the right arrow, move through the pages
+    driver.find_element_by_id("sitbReaderRightPageTurner").click()
+    time.sleep(2)
+    #Get any new pages that have loaded (multiple pages can load at once)
+    pages = driver.find_elements_by_xpath("//div[@class='pageImage']/div/img")
+    for page in pages:
+        image = page.get_attribute("src")
+        imageList.add(image)
+
+driver.quit()
+
+#Start processing the images we've collected URLs for with Tesseract
+for image in sorted(imageList):
+    urlretrieve(image, "page.jpg")
+    p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+    p.wait()
+    f = open("page.txt", "r")
+    print(f.read())
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+This IS some text. wntten In Anal, that will be ,`
	`2`	`+Tesseracl Here are some symbols: IWM .`
	`3`	`+`