scrapetools.link_scraper
from urllib.parse import urlparse, urlunparse

from bs4 import BeautifulSoup


class LinkScraper:
    def __init__(self, html_src: str, page_url: str):
        self.soup = BeautifulSoup(html_src, features="html.parser")
        self.parsed_url = urlparse(page_url)
        self.page_links = []
        self.img_links = []
        self.script_links = []

    def format_relative_links(self, links: list[str]) -> list[str]:
        """Parses a list of links and constructs a full url
        according to self.parsed_url for any link whose urlparse
        result is missing a scheme or netloc.

        Full urls are returned unedited other than stripping any
        leading or trailing forward slashes."""
        formatted_links = []
        for link in links:
            # Strip surrounding whitespace and embedded quote characters.
            link = (
                link.strip(" \n\t\r")
                .replace('"', "")
                .replace("\\", "")
                .replace("'", "")
            )
            parsed_url = urlparse(link)
            # Leave links containing "@" or a space (e.g. mailto links) untouched.
            if all(ch not in link for ch in "@ "):
                parsed_url = list(parsed_url)
                if parsed_url[0] == "":
                    parsed_url[0] = self.parsed_url.scheme
                if parsed_url[1] == "":
                    parsed_url[1] = self.parsed_url.netloc
            formatted_links.append(urlunparse(parsed_url).strip("/"))
        return formatted_links

    def remove_duplicates(self, obj: list) -> list:
        """Removes duplicate members."""
        return list(set(obj))

    def process_links(self, links: list[str]) -> list[str]:
        """Formats relative links, removes duplicates, and sorts in alphabetical order."""
        return sorted(self.remove_duplicates(self.format_relative_links(links)))

    def find_all(self, tag_name: str, attribute_name: str) -> list[str]:
        """Finds all values of attribute_name across tag_name elements.

        Filters out any value containing a '#' fragment."""
        return [
            tag.get(attribute_name)
            for tag in self.soup(tag_name, recursive=True)
            if tag.get(attribute_name) is not None
            and "#" not in tag.get(attribute_name)
        ]

    def filter_same_site(self, links: list[str]) -> list[str]:
        """Filters out links whose netloc doesn't match
        self.parsed_url.netloc (a leading 'www.' is ignored on both sides)."""
        # removeprefix() rather than strip("www."): strip() removes a *set of
        # characters*, so it would also mangle hosts that merely begin or end
        # with "w" or ".".
        return [
            link
            for link in links
            if urlparse(link).netloc.removeprefix("www.")
            == self.parsed_url.netloc.removeprefix("www.")
        ]

    def scrape_page_links(self):
        """Scrape links from a fixed set of tag/attribute pairs:
        <a href>, <link href>, <source src>, and <div> src/data-src/data-url/href."""
        links = []
        for tag, attribute in [
            ("a", "href"),
            ("link", "href"),
            ("source", "src"),
            ("div", "src"),
            ("div", "data-src"),
            ("div", "data-url"),
            ("div", "href"),
        ]:
            links.extend(self.find_all(tag, attribute))
        self.page_links = self.process_links(links)

    def scrape_img_links(self):
        """Scrape links from the src and data-src attributes of <img> tags."""
        self.img_links = self.process_links(
            self.find_all("img", "src") + self.find_all("img", "data-src")
        )

    def scrape_script_links(self):
        """Scrape script links from the src attribute of <script> tags."""
        self.script_links = self.process_links(self.find_all("script", "src"))

    def scrape_page(self):
        """Scrape all link types."""
        for scrape in [
            self.scrape_page_links,
            self.scrape_img_links,
            self.scrape_script_links,
        ]:
            scrape()
        self.merge_image_links_from_non_img_tags()

    def merge_image_links_from_non_img_tags(self):
        """Finds links in self.script_links and self.page_links
        that have one of these image file extensions and adds them
        to self.img_links."""
        formats = [
            ".jpg",
            ".jpeg",
            ".png",
            ".svg",
            ".bmp",
            ".tiff",
            ".pdf",
            ".eps",
            ".gif",
            ".jfif",
            ".webp",
            ".heif",
            ".avif",
            ".bat",
            ".bpg",
        ]
        for link in self.script_links + self.page_links:
            if any(ext in link for ext in formats):
                self.img_links.append(link)
        self.img_links = sorted(self.remove_duplicates(self.img_links))

    def get_links(
        self,
        link_type: str = "all",
        same_site_only: bool = False,
        excluded_links: list[str] | None = None,
    ) -> list[str]:
        """Returns a list of urls found on the page.

        :param link_type: Can be 'all', 'page', 'img', or 'script'.

        :param same_site_only: Excludes external urls if True.

        :param excluded_links: A list of urls to filter out of the results.
        Useful for excluding duplicates when recursively scraping a website.
        Can also be used with link_type='all' to get two link types in one call:

        e.g. links = scraper.get_links(link_type="all", excluded_links=scraper.script_links)
        will return page links and img links."""
        match link_type:
            case "all":
                links = self.remove_duplicates(
                    self.page_links + self.img_links + self.script_links
                )
            case "page":
                links = self.page_links
            case "img":
                links = self.img_links
            case "script":
                links = self.script_links
            case _:
                # Fail loudly instead of raising a NameError on `links` below.
                raise ValueError(f"Invalid link_type: {link_type}")
        if same_site_only:
            links = self.filter_same_site(links)
        if excluded_links:
            links = [link for link in links if link not in excluded_links]
        return sorted(links)
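A minimal usage sketch of the class above (the HTML string and URLs are illustrative only, not part of the package):

from scrapetools.link_scraper import LinkScraper

html = (
    '<a href="/about">About</a>'
    '<img src="/logo.png">'
    '<script src="https://cdn.example.com/app.js"></script>'
)
scraper = LinkScraper(html, "https://example.com/index.html")
scraper.scrape_page()
scraper.get_links("page")                         # ['https://example.com/about']
scraper.get_links("img")                          # ['https://example.com/logo.png']
scraper.get_links("script", same_site_only=True)  # [] (cdn.example.com is external)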
class LinkScraper:
def format_relative_links(self, links: list[str]) -> list[str]:
Parses a list of links and constructs a full url according to self.parsed_url for any link whose urlparse result is missing a scheme or netloc.
Full urls are returned unedited other than stripping any leading or trailing forward slashes.
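A quick illustration of that behavior (the domains are hypothetical):

from scrapetools.link_scraper import LinkScraper

scraper = LinkScraper("", "https://example.com/blog/")
scraper.format_relative_links(
    ["/a", "https://other.com/b/", "mailto:user@example.com"]
)
# ['https://example.com/a', 'https://other.com/b', 'mailto:user@example.com']
# "/a" is resolved against the page url, the full url loses its trailing
# slash, and the mailto link (it contains "@") is passed through unchanged.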
def remove_duplicates(self, obj: list) -> list:
Removes duplicate members.
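Note that set() does not preserve order; callers sort afterwards (see process_links below):

from scrapetools.link_scraper import LinkScraper

LinkScraper("", "https://example.com").remove_duplicates(["a", "b", "a"])
# ['a', 'b'] in arbitrary order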
def process_links(self, links: list[str]) -> list[str]:
Formats relative links, removes duplicates, and sorts in alphabetical order.
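For example (hypothetical inputs):

from scrapetools.link_scraper import LinkScraper

scraper = LinkScraper("", "https://example.com")
scraper.process_links(["/b/", "/a", "/a"])
# ['https://example.com/a', 'https://example.com/b']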
def find_all(self, tag_name: str, attribute_name: str) -> list[str]:
Finds all values of attribute_name across tag_name elements.
Filters out any value containing a '#' fragment.
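For example, with a fragment link and an attribute-less tag (sample HTML, not from the package):

from scrapetools.link_scraper import LinkScraper

html = '<a href="/x">x</a><a href="/y#top">y</a><a>plain</a>'
LinkScraper(html, "https://example.com").find_all("a", "href")
# ['/x']  -- '/y#top' is dropped as a fragment; the bare <a> has no href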
def filter_same_site(self, links: list[str]) -> list[str]:
Filters out links whose netloc doesn't match self.parsed_url.netloc (a leading 'www.' is ignored on both sides).
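For example (hypothetical urls, assuming the removeprefix("www.") comparison shown in the source above):

from scrapetools.link_scraper import LinkScraper

scraper = LinkScraper("", "https://www.example.com")
scraper.filter_same_site(
    ["https://example.com/a", "https://cdn.other.com/b"]
)
# ['https://example.com/a']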
def scrape_page_links(self):
Scrape links from a fixed set of tag/attribute pairs: <a href>, <link href>, <source src>, and <div> src/data-src/data-url/href.
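For example (sample HTML):

from scrapetools.link_scraper import LinkScraper

html = '<link href="/style.css"><div data-src="/banner.jpg"></div>'
scraper = LinkScraper(html, "https://example.com")
scraper.scrape_page_links()
scraper.page_links
# ['https://example.com/banner.jpg', 'https://example.com/style.css']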