scrapetools.email_scraper
"""Heuristics for pulling likely email addresses out of arbitrary text."""

from string import printable
from urllib.parse import unquote


def validate(email: str) -> bool:
    """Check whether a string is likely to be an email address.

    This is a heuristic filter for scraped text, not a strict validator:
    some addresses violating these rules may technically be valid, but are
    practically never seen in use out in the wild.

    Args:
        email: Candidate address to check.

    Returns:
        True if every rule below holds, otherwise False. Never raises for
        string input.
    """
    if email.count("@") != 1 or "." not in email:
        return False
    local, domain = email.split("@")
    # Length rules come first: local is 2-64 characters, domain is more than 3.
    # Checking them up front also guarantees both parts are non-empty, so an
    # address such as "a.b@" is rejected instead of raising IndexError.
    if not 2 <= len(local) <= 64 or len(domain) <= 3:
        return False
    # The address may not start or end with '.', and '.' may not follow '@'.
    if local.startswith(".") or domain.startswith(".") or domain.endswith("."):
        return False
    # '@' must come before the last '.', i.e. the domain itself contains a '.'.
    if "." not in domain:
        return False
    # No '_' in the domain, no consecutive '.', and no leading 'www.'.
    if "_" in domain or ".." in email or email.startswith("www."):
        return False
    # At least one character in local is alphabetical.
    if not any(ch.isalpha() for ch in local):
        return False
    # Neither part may consist of only numbers (dots ignored).
    if all(ch.isnumeric() for ch in local.replace(".", "")):
        return False
    if all(ch.isnumeric() for ch in domain.replace(".", "")):
        return False
    # Scraped file names such as "logo@2x.png" look like emails; reject any
    # domain containing one of these file extensions.
    file_extensions = (
        ".png",
        ".jpg",
        ".js",
        ".html",
        ".svg",
        ".jpeg",
        ".mp4",
        ".mpeg",
        ".css",
        ".pdf",
        ".wav",
        ".docx",
        ".txt",
        ".rtf",
        ".gif",
        ".webp",
        ".x.x",
    )
    return not any(ext in domain for ext in file_extensions)


def find_last_valid_character_offset(text: str) -> int:
    """Count the valid characters adjacent to the '@' at one end of a string.

    If the string starts with '@' it is scanned left to right; otherwise, if
    it ends with '@', it is scanned right to left, moving away from the '@'.

    Args:
        text: String whose first or last character is '@'.

    Returns:
        The number of valid characters between '@' and the first invalid
        character, e.g. '@abcde%' returns 5 and '#123@' returns 3.
        If no invalid characters are found, returns 'len(text) - 1'.

    Raises:
        ValueError: If text is empty or neither starts nor ends with '@'.
    """
    # Technically some of these characters are valid in an email string,
    # but the ratio of how often they're used to how often they produce
    # false positives makes them worth disregarding.
    invalid_characters = " <>[]{},\"':;\\/#$%^&*()=+`?|\n\t\r"
    if not text or "@" not in (text[0], text[-1]):
        raise ValueError(
            'First or last character of text arg needs to be "@"\n',
            f"Argument {text} is invalid.",
        )
    if text[0] != "@":
        # The '@' is at the end: reverse so the scan always moves away from it.
        text = text[::-1]
    for offset, ch in enumerate(text[1:]):
        if ch in invalid_characters or ch not in printable:
            return offset
    return len(text) - 1


def strip_unicode(emails: list[str]) -> list[str]:
    """Remove escaped-unicode residue from the front of email addresses.

    Text such as '\\u003ejohn@example.com' is scraped as
    'u003ejohn@example.com' because the backslash ends the scan; this strips
    the leftover 'u003e' / 'u00a0' marker.

    Args:
        emails: Addresses to clean.

    Returns:
        A new list with any leading markers removed from each address.
        Addresses that do not start with a marker are returned unchanged.
    """
    markers = ("u003e", "u00a0")
    stripped_emails = []
    for email in emails:
        # Loop so that stacked markers (e.g. 'u00a0u003e...') are all removed.
        while email.startswith(markers):
            for marker in markers:
                email = email.removeprefix(marker)
        stripped_emails.append(email)
    return stripped_emails


def scrape_emails(text: str) -> list[str]:
    """Extract potential email addresses from the given text.

    Each '@' in the text is expanded left and right over valid characters
    to form a candidate, trailing non-alphabetical characters are trimmed,
    and the candidate is kept if 'validate' accepts it.

    Args:
        text: Arbitrary text; percent-encoding is decoded before scanning.

    Returns:
        A sorted list of unique, lowercased addresses.
    """
    if "%" in text:
        # decode percent encoding
        text = unquote(text)
    for ch in ("\n", "\t", "\r"):
        text = text.replace(ch, " ")
    emails = []
    search_from = 0
    # Every pass consumes exactly one '@', so one pass per '@' covers the text.
    for _ in range(text.count("@")):
        atdex = text.find("@", search_from)
        next_atdex = text.find("@", atdex + 1)
        # The chunk holds exactly one '@': it starts where the previous
        # candidate ended and stops just before the next '@', if any.
        chunk = text[search_from:next_atdex] if next_atdex != -1 else text[search_from:]
        chunk_atdex = atdex - search_from
        startdex = find_last_valid_character_offset(chunk[: chunk_atdex + 1])
        stopdex = find_last_valid_character_offset(chunk[chunk_atdex:])
        email = chunk[chunk_atdex - startdex : chunk_atdex + stopdex + 1]
        # Trim trailing punctuation and digits; this may consume the '@'
        # itself, or the whole candidate, when nothing alphabetical follows.
        while email and not email[-1].isalpha():
            email = email[:-1]
        if validate(email):
            emails.append(email.lower())
        # Resume after the candidate's domain. The extra '+ 1' ensures the
        # search position still advances past this '@' when the domain is empty.
        domain = email.partition("@")[2]
        search_from = atdex + len(domain) + 1
    return sorted(set(strip_unicode(emails)))
def validate(email: str) -> bool:
    """Check whether a string is likely to be an email address.

    This is a heuristic filter for scraped text, not a strict validator:
    some addresses violating these rules may technically be valid, but are
    practically never seen in use out in the wild.

    Args:
        email: Candidate address to check.

    Returns:
        True if every rule below holds, otherwise False. Never raises for
        string input.
    """
    if email.count("@") != 1 or "." not in email:
        return False
    local, domain = email.split("@")
    # Length rules come first: local is 2-64 characters, domain is more than 3.
    # Checking them up front also guarantees both parts are non-empty, so an
    # address such as "a.b@" is rejected instead of raising IndexError.
    if not 2 <= len(local) <= 64 or len(domain) <= 3:
        return False
    # The address may not start or end with '.', and '.' may not follow '@'.
    if local.startswith(".") or domain.startswith(".") or domain.endswith("."):
        return False
    # '@' must come before the last '.', i.e. the domain itself contains a '.'.
    if "." not in domain:
        return False
    # No '_' in the domain, no consecutive '.', and no leading 'www.'.
    if "_" in domain or ".." in email or email.startswith("www."):
        return False
    # At least one character in local is alphabetical.
    if not any(ch.isalpha() for ch in local):
        return False
    # Neither part may consist of only numbers (dots ignored).
    if all(ch.isnumeric() for ch in local.replace(".", "")):
        return False
    if all(ch.isnumeric() for ch in domain.replace(".", "")):
        return False
    # Scraped file names such as "logo@2x.png" look like emails; reject any
    # domain containing one of these file extensions.
    file_extensions = (
        ".png",
        ".jpg",
        ".js",
        ".html",
        ".svg",
        ".jpeg",
        ".mp4",
        ".mpeg",
        ".css",
        ".pdf",
        ".wav",
        ".docx",
        ".txt",
        ".rtf",
        ".gif",
        ".webp",
        ".x.x",
    )
    return not any(ext in domain for ext in file_extensions)
Checks string to see if it's likely an email address.
Returns True or False.
Some emails violating some of these rules may technically be valid, but are practically never seen in use out in the wild.
def find_last_valid_character_offset(text: str) -> int:
    """Count the valid characters adjacent to the '@' at one end of a string.

    If the string starts with '@' it is scanned left to right; otherwise, if
    it ends with '@', it is scanned right to left, moving away from the '@'.

    Args:
        text: String whose first or last character is '@'.

    Returns:
        The number of valid characters between '@' and the first invalid
        character, e.g. '@abcde%' returns 5 and '#123@' returns 3.
        If no invalid characters are found, returns 'len(text) - 1'.

    Raises:
        ValueError: If text is empty or neither starts nor ends with '@'.
    """
    # Technically some of these characters are valid in an email string,
    # but the ratio of how often they're used to how often they produce
    # false positives makes them worth disregarding.
    invalid_characters = " <>[]{},\"':;\\/#$%^&*()=+`?|\n\t\r"
    if not text or "@" not in (text[0], text[-1]):
        raise ValueError(
            'First or last character of text arg needs to be "@"\n',
            f"Argument {text} is invalid.",
        )
    if text[0] != "@":
        # The '@' is at the end: reverse so the scan always moves away from it.
        text = text[::-1]
    for offset, ch in enumerate(text[1:]):
        if ch in invalid_characters or ch not in printable:
            return offset
    return len(text) - 1
Iterates through a string to find the index of the last valid character, assuming that string either starts or ends with '@'.
If the string neither starts nor ends with '@', a ValueError is raised.
Returns the number of valid characters between '@' and first invalid character. e.g. '@abcde%' will return 5 and '#123@' will return 3.
If no invalid characters are found, the function will return 'len(text)-1'.
def strip_unicode(emails: list[str]) -> list[str]:
    """Remove escaped-unicode residue from the front of email addresses.

    Strips a leading 'u003e' or 'u00a0' marker (what remains of an escaped
    unicode sequence once its backslash is dropped) from each address.

    Args:
        emails: Addresses to clean.

    Returns:
        A new list with any leading markers removed from each address.
        Addresses that do not start with a marker are returned unchanged.
    """
    markers = ("u003e", "u00a0")
    stripped_emails = []
    for email in emails:
        # Loop so that stacked markers (e.g. 'u00a0u003e...') are all removed.
        while email.startswith(markers):
            for marker in markers:
                email = email.removeprefix(marker)
        stripped_emails.append(email)
    return stripped_emails
Removes unicode text that often gets picked up at the front of email addresses and returns the list.
def scrape_emails(text: str) -> list[str]:
    """Extract potential email addresses from the given text.

    Each '@' in the text is expanded left and right over valid characters
    (via 'find_last_valid_character_offset') to form a candidate, trailing
    non-alphabetical characters are trimmed, and the candidate is kept if
    'validate' accepts it.

    Args:
        text: Arbitrary text; percent-encoding is decoded before scanning.

    Returns:
        A sorted list of unique, lowercased addresses, after passing them
        through 'strip_unicode'.
    """
    if "%" in text:
        # decode percent encoding
        text = unquote(text)
    for ch in ("\n", "\t", "\r"):
        text = text.replace(ch, " ")
    emails = []
    search_from = 0
    # Every pass consumes exactly one '@', so one pass per '@' covers the text.
    for _ in range(text.count("@")):
        atdex = text.find("@", search_from)
        next_atdex = text.find("@", atdex + 1)
        # The chunk holds exactly one '@': it starts where the previous
        # candidate ended and stops just before the next '@', if any.
        chunk = text[search_from:next_atdex] if next_atdex != -1 else text[search_from:]
        chunk_atdex = atdex - search_from
        startdex = find_last_valid_character_offset(chunk[: chunk_atdex + 1])
        stopdex = find_last_valid_character_offset(chunk[chunk_atdex:])
        email = chunk[chunk_atdex - startdex : chunk_atdex + stopdex + 1]
        # Trim trailing punctuation and digits; this may consume the '@'
        # itself, or the whole candidate, when nothing alphabetical follows.
        # Checked explicitly rather than relying on a caught IndexError.
        while email and not email[-1].isalpha():
            email = email[:-1]
        if validate(email):
            emails.append(email.lower())
        # Resume after the candidate's domain. The extra '+ 1' ensures the
        # search position still advances past this '@' when the domain is
        # empty (including when the '@' was trimmed away above).
        domain = email.partition("@")[2]
        search_from = atdex + len(domain) + 1
    return sorted(set(strip_unicode(emails)))
Extracts potential emails from given text and returns as a list of strings.