|
# Pre-compiled regex: captures the target of every double-quoted href
# attribute (non-greedy, so each anchor yields one match).
link_re = re.compile(r'href="(.*?)"')
8 | 8 |
|
9 | 9 |
|
10 | | -def crawl(url, maxlevel): |
| 10 | +def crawl(url): |
11 | 11 |
|
12 | 12 | result = set() |
13 | 13 |
|
14 | | - while maxlevel > 0: |
| 14 | + req = requests.get(url) |
15 | 15 |
|
16 | | - # Get the webpage |
17 | | - req = requests.get(url) |
| 16 | + # Check if successful |
| 17 | + if(req.status_code != 200): |
| 18 | + return [] |
18 | 19 |
|
19 | | - # Check if successful |
20 | | - if(req.status_code != 200): |
21 | | - return [] |
| 20 | + # Find links |
| 21 | + links = link_re.findall(req.text) |
22 | 22 |
|
23 | | - # Find and follow all the links |
24 | | - links = link_re.findall(req.text) |
25 | | - for link in links: |
26 | | - # Get an absolute URL for a link |
27 | | - link = urlparse.urljoin(url, link) |
| 23 | + print "\nFound {} links".format(len(links)) |
28 | 24 |
|
29 | | - # Find all emails on current page |
30 | | - result.update(email_re.findall(req.text)) |
| 25 | + # Search links for emails |
| 26 | + for link in links: |
31 | 27 |
|
32 | | - print "Crawled level: {}".format(maxlevel) |
| 28 | + # Get an absolute URL for a link |
| 29 | + link = urlparse.urljoin(url, link) |
33 | 30 |
|
34 | | - # new level |
35 | | - maxlevel -= 1 |
36 | | - |
37 | | - # recurse |
38 | | - crawl(link, maxlevel) |
| 31 | + # Find all emails on current page |
| 32 | + result.update(email_re.findall(req.text)) |
39 | 33 |
|
40 | 34 | return result |
41 | 35 |
|
42 | | -emails = crawl('http://www.website_goes_here_dot_com', 2) |
| 36 | +if __name__ == '__main__': |
| 37 | + emails = crawl('http://www.realpython.com') |
43 | 38 |
|
44 | | -print "\nScrapped e-mail addresses:" |
45 | | -for email in emails: |
46 | | - print email |
| 39 | + print "\nScrapped e-mail addresses:" |
| 40 | + for email in emails: |
| 41 | + print email |
| 42 | + print "\n" |
0 commit comments