Python-Course-Projects/spider.py at main · crackallcode/Python-Course-Projects

44 lines (33 loc) · 1.06 KB

from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from urllib import *
visited_urls = set()
def spider_urls(url, keyword):
        response = requests.get(url)
    except:
        print(f"Request failed {url}")
        return
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        a_tag = soup.find_all('a')
        urls = []
        for tag in a_tag:
            href = tag.get("href")
            if href is not None and href != "":
                urls.append(href)
        # print(urls)
        for urls2 in urls:
            if urls2 not in visited_urls:
                visited_urls.add(urls2)
                url_join = urljoin(url, urls2)
                if keyword in url_join:
                    print(url_join)
                    spider_urls(url_join, keyword)
            else:
# https://www.yahoo.com
url = input("Eneter the URL you want to scrap. ")
keyword = input("Enter the keyword to search for in the URL provided. ")
spider_urls(url, keyword)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

spider.py

Latest commit

History

spider.py

File metadata and controls