import requests
from bs4 import BeautifulSoup

# Seconds to wait for a site before giving up on it; without a timeout
# requests.get() can block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10


def getOutBoundURLs(anchors):
    """Return the ``href`` value of every anchor that carries one.

    Reference:
    http://www.w3.org/TR/html5/text-level-semantics.html#the-a-element

    ``anchors`` holds the <a> tag elements taken from the HTML content.
    Each element is expected to expose an ``attrs`` mapping; elements
    without an ``href`` entry (or without ``attrs`` at all) are reported
    on stdout as "Ignoring ..." and left out of the result.

    The addresses are returned in the order the anchors were given.

    @author kgashok
    """
    addressList = []
    for anchor in anchors:
        try:
            linkaddress = anchor.attrs['href']
        except (AttributeError, KeyError):
            # Only a missing attribute is tolerated here; anything else
            # (including Ctrl-C) must propagate instead of being hidden.
            print("Ignoring", anchor)
        else:
            addressList.append(linkaddress)
    return addressList


def getOutBoundHttpURLs(alist):
    """Return the addresses in ``alist`` that start with ``"https:"``.

    Despite the name, plain ``http:`` addresses, relative links and
    fragments are all filtered out; only the secure scheme is kept.
    """
    return [address for address in alist if address.startswith("https:")]


def generateBanner(url, anchorCount, outBoundCount, linkCount):
    """Build the header block printed before a site's list of links.

    The banner shows ``url`` followed by three counters: the number of
    anchors, the number of anchors that had an href, and the number of
    those hrefs that are https links. It begins and ends with a newline.
    """
    return '''
#############################
---- Outbound links for {0} ({1}, {2}, https: {3})
#############################
'''.format(url, anchorCount, outBoundCount, linkCount)


def printURLs(url, anchors, f=None):
    """Print the https addresses found in ``anchors`` for the page ``url``.

    A banner with the link counts is printed first, then one address per
    line. If ``f`` is a writable file object, the banner and one
    ``<url> -> <address>`` line per address are written to it as well.
    """
    outBoundURLs = getOutBoundURLs(anchors)
    hhrefs = getOutBoundHttpURLs(outBoundURLs)

    banner = generateBanner(url, len(anchors), len(outBoundURLs), len(hhrefs))
    print(banner)
    if f:
        f.write(banner)

    for linkaddress in hhrefs:
        print(linkaddress)
        if f:
            f.write(url + " -> " + linkaddress + "\n")


urls = [
    "http://www.python.org",
    "http://www.facebook.com",
    "http://www.kct.ac.in",
    "http://www.psgtech.edu",
    "http://www.kgkite.ac.in",
    # Currently disabled:
    # "http://www.kghospital.com",
    # "http://www.kgisl.com",
    "http://www.sece.ac.in",
    "http://www.siet.ac.in",
    "http://www.bennett.edu.in",
]

# The context manager guarantees data.txt is flushed and closed even if a
# request or the parser raises part-way through the crawl. UTF-8 keeps the
# write from failing on non-ASCII addresses under a narrow locale encoding.
with open("data.txt", "w", encoding="utf-8") as dataFile:
    for url in urls:
        try:
            html = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # One unreachable site should not abort the remaining ones.
            print("Skipping", url, "-", error)
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(html.text, "html.parser")
        anchors = soup.find_all("a")
        # Equivalent one-liner for a single page:
        # anchors = BeautifulSoup(requests.get(url).text, "html.parser").find_all('a')
        printURLs(url, anchors, dataFile)