I am trying to build a web crawler to extract all the links on a webpage. I have created 2 python files. (class: scanner.py and object: vulnerability-scanner.py). When I run the script, it is continuously running without stopping. I am unable to find the error. Help me to solve this.
Here is my source code:
scanner.py
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
class Scanner:
colorama.init()
def __init__(self, url):
self.target_url = url
self.target_links = []
def is_valid(self, url):
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)
def get_all_website_links(self, url):
GREEN = colorama.Fore.GREEN
WHITE = colorama.Fore.WHITE
RESET = colorama.Fore.RESET
urls = set()
internal_urls = set()
external_urls = set()
domain_name = urlparse(url).netloc
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
for a_tag in soup.findAll("a"):
href = a_tag.attrs.get("href")
if href == "" or href is None:
continue
href = urljoin(url, href)
parsed_href = urlparse(href)
href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
if not self.is_valid(href):
continue
if href in internal_urls:
continue
if domain_name not in href:
if href not in external_urls:
print(f"{WHITE}[*] External link: {href}{RESET}")
external_urls.add(href)
continue
print(f"{GREEN}[*] Internal link: {href}{RESET}")
urls.add(href)
internal_urls.add(href)
return urls
def crawl(self, url):
href_links = self.get_all_website_links(url)
for link in href_links:
print(link)
self.crawl(link)
vulnerability-scanner.py
import argu
target_url = "https://hack.me/"
vul_scanner = argu.Scanner(target_url)
vul_scanner.crawl(target_url)