|
| 1 | +from urllib.request import urlopen |
| 2 | +from urllib.parse import unquote |
| 3 | +import random |
| 4 | +import re |
| 5 | +from bs4 import BeautifulSoup |
| 6 | +import unittest |
| 7 | + |
| 8 | +class TestWikipedia(unittest.TestCase): |
| 9 | + |
| 10 | + bsObj = None |
| 11 | + url = None |
| 12 | + |
| 13 | + |
| 14 | + def test_PageProperties(self): |
| 15 | + global bsObj |
| 16 | + global url |
| 17 | + |
| 18 | + url = "http://en.wikipedia.org/wiki/Monty_Python" |
| 19 | + #Test the first 100 pages we encounter |
| 20 | + for i in range(1, 100): |
| 21 | + bsObj = BeautifulSoup(urlopen(url), "html.parser") |
| 22 | + titles = self.titleMatchesURL() |
| 23 | + self.assertEquals(titles[0], titles[1]) |
| 24 | + self.assertTrue(self.contentExists()) |
| 25 | + url = self.getNextLink() |
| 26 | + print("Done!") |
| 27 | + |
| 28 | + #测试标题 |
| 29 | + def titleMatchesURL(self): |
| 30 | + global bsObj |
| 31 | + global url |
| 32 | + pageTitle = bsObj.find("h1").get_text() |
| 33 | + urlTitle = url[(url.index("/wiki/")+6):] |
| 34 | + urlTitle = urlTitle.replace("_", " ") |
| 35 | + urlTitle = unquote(urlTitle) |
| 36 | + return [pageTitle.lower(), urlTitle.lower()] |
| 37 | + |
| 38 | + def contentExists(self): |
| 39 | + global bsObj |
| 40 | + content = bsObj.find("div",{"id":"mw-content-text"}) |
| 41 | + if content is not None: |
| 42 | + return True |
| 43 | + return False |
| 44 | + |
| 45 | + def getNextLink(self): |
| 46 | + global bsObj |
| 47 | + links = bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")) |
| 48 | + link = links[random.randint(0, len(links)-1)].attrs['href'] |
| 49 | + print("Next link is: "+link) |
| 50 | + return "http://en.wikipedia.org"+link |
| 51 | + |
# Standard entry point: run the unittest runner when this file is executed
# directly (e.g. `python thisfile.py`); does nothing when imported.
if __name__ == '__main__':
    unittest.main()
0 commit comments