scrapetools.email_scraper

  1from string import printable
  2from urllib.parse import unquote
  3
  4
  5def validate(email: str) -> bool:
  6    """Checks string to see if it's likely an email address.
  7
  8    Returns True or False.
  9
 10    Some emails violating some of these rules
 11    may technically be valid, but are practically
 12    never seen in use out in the wild."""
 13    if email.count("@") != 1 or email.count(".") == 0:
 14        return False
 15    atdex = email.find("@")
 16    last_dot = email.rfind(".")
 17    local, domain = email.split("@")
 18    # RULES:
 19    #'@' comes before the last '.'
 20    # local part is 64 characters or less
 21    # domain part doesn't contain any '_'
 22    # at least 1 character in local is alphabetical
 23    # 1st character is not '@' or '.'
 24    # last character is not '@' or '.'
 25    # character after '@' is not '.'
 26    # doesn't start with 'www.'
 27    # local is two or more characters
 28    # domain is more than 3 characters
 29    # domain doesn't consist of only numbers
 30    # local doesn't consist of only numbers
 31    # no consecutive '.' in email
 32    # email doesn't contain a listed file ext
 33    if all(
 34        [
 35            atdex < last_dot,
 36            len(local) <= 64,
 37            domain.count("_") == 0,
 38            any(ch.isalpha() for ch in local),
 39            email[0] not in ["@", "."],
 40            email[-1] not in ["@", "."],
 41            email[email.find("@") + 1] != ".",
 42            not email.startswith("www."),
 43            len(local) >= 2,
 44            len(domain) > 3,
 45            not all(ch.isnumeric() for ch in domain.replace(".", "")),
 46            not all(ch.isnumeric() for ch in local.replace(".", "")),
 47            all(email[i - 1] != "." for i, ch in enumerate(email) if ch == "."),
 48            all(
 49                ext not in domain
 50                for ext in [
 51                    ".png",
 52                    ".jpg",
 53                    ".js",
 54                    ".html",
 55                    ".svg",
 56                    ".jpeg",
 57                    ".mp4",
 58                    ".mpeg",
 59                    ".css",
 60                    ".pdf",
 61                    ".wav",
 62                    ".docx",
 63                    ".txt",
 64                    ".rtf",
 65                    ".gif",
 66                    ".webp",
 67                    ".x.x",
 68                ]
 69            ),
 70        ]
 71    ):
 72        return True
 73    else:
 74        return False
 75
 76
 77def find_last_valid_character_offset(text: str) -> int:
 78    """Iterates through a string to find the index of the last valid character,
 79    assuming that string either starts or ends with '@'.
 80
 81    If the string doesn't start or end with '@', an Exception is raised.
 82
 83    Returns the number of valid characters between '@' and first invalid character.
 84    e.g. '@abcde%' will return 5 and '#123@' will return 3.
 85
 86    If no invalid characters are found, the function will return
 87    'len(text)-1'."""
 88
 89    """ Technically some of these characters are valid in an email string,
 90    but the ratio of how often they're used to how often they produce
 91    false positives makes them worth disregarding. """
 92    invalid_characters = " <>[]{},\"':;\\/#$%^&*()=+`?|\n\t\r"
 93    if text[-1] == "@" and text[0] != "@":
 94        # reverse the string
 95        text = text[::-1]
 96    elif text[0] != "@":
 97        raise ValueError(
 98            'First or last character of text arg needs to be "@"\n',
 99            f"Argument {text} is invalid.",
100        )
101    i = 1
102    while i < len(text):
103        if text[i] in invalid_characters or text[i] not in printable:
104            return i - 1
105        else:
106            i += 1
107    return len(text) - 1
108
109
def strip_unicode(emails: list[str]) -> list[str]:
    """Remove escaped-unicode residue picked up at the front of addresses.

    Scraped text often contains literal escape sequences such as '\\u003e'
    ('>') or '\\u00a0' (non-breaking space) directly before an address, which
    leaves 'u003e' / 'u00a0' glued to the front of the extracted email.

    Everything up to and including the marker is dropped. The cut is made at
    the marker's actual position, so the address is intact whether or not a
    backslash (or anything else) precedes the marker.

    Args:
        emails: Extracted email strings.

    Returns:
        A new list, in the same order, with the markers stripped. The input
        list is not modified.
    """
    stripped_emails = []
    for email in emails:
        for marker in ("u003e", "u00a0"):
            position = email.find(marker)
            if position != -1:
                email = email[position + len(marker) :]
        stripped_emails.append(email)
    return stripped_emails
120
121
def scrape_emails(text: str) -> list[str]:
    """Extract potential emails from ``text``.

    The text is percent-decoded (only when it contains '%') and newlines,
    tabs and carriage returns are replaced by spaces. Then, for every '@',
    the candidate is grown outwards from the '@' until an invalid character
    is hit on each side, trailing non-alphabetical characters are trimmed,
    and the result is kept if ``validate`` accepts it.

    Args:
        text: Arbitrary text (e.g. page source) to search.

    Returns:
        A sorted list of unique, lower-cased addresses after
        ``strip_unicode`` has been applied. Empty if nothing was found.
    """
    if "%" in text:
        # decode percent encoding
        text = unquote(text)
    for ch in ("\n", "\t", "\r"):
        text = text.replace(ch, " ")
    emails = []
    last_stopdex = 0
    for _ in range(text.count("@")):
        atdex = text.find("@", last_stopdex)
        next_atdex = text.find("@", atdex + 1)
        try:
            # Only look at the text between the previous candidate and the
            # next '@', so each candidate contains exactly one '@'.
            chunk = (
                text[last_stopdex:next_atdex]
                if next_atdex != -1
                else text[last_stopdex:]
            )
            chunk_atdex = chunk.find("@")
            startdex = find_last_valid_character_offset(chunk[: chunk_atdex + 1])
            stopdex = find_last_valid_character_offset(chunk[chunk_atdex:])
            email = chunk[chunk_atdex - startdex : stopdex + chunk_atdex + 1]
            # Trim trailing digits/punctuation; an IndexError here means the
            # candidate was trimmed down to nothing.
            while email[-1].isnumeric() or not email[-1].isalpha():
                email = email[:-1]
            if validate(email):
                emails.append(email.lower())
            # The extra '+ 1' is to ensure last_stopdex increments
            # if 'len(email.split('@')[1])' is 0.
            last_stopdex = atdex + len(email.split("@")[1]) + 1
        except (IndexError, ValueError):
            # Malformed candidate (empty piece, '@' trimmed away, ...):
            # skip this '@' and carry on with the next one.
            last_stopdex = atdex + 1
    return sorted(set(strip_unicode(emails)))
def validate(email: str) -> bool:
 6def validate(email: str) -> bool:
 7    """Checks string to see if it's likely an email address.
 8
 9    Returns True or False.
10
11    Some emails violating some of these rules
12    may technically be valid, but are practically
13    never seen in use out in the wild."""
14    if email.count("@") != 1 or email.count(".") == 0:
15        return False
16    atdex = email.find("@")
17    last_dot = email.rfind(".")
18    local, domain = email.split("@")
19    # RULES:
20    #'@' comes before the last '.'
21    # local part is 64 characters or less
22    # domain part doesn't contain any '_'
23    # at least 1 character in local is alphabetical
24    # 1st character is not '@' or '.'
25    # last character is not '@' or '.'
26    # character after '@' is not '.'
27    # doesn't start with 'www.'
28    # local is two or more characters
29    # domain is more than 3 characters
30    # domain doesn't consist of only numbers
31    # local doesn't consist of only numbers
32    # no consecutive '.' in email
33    # email doesn't contain a listed file ext
34    if all(
35        [
36            atdex < last_dot,
37            len(local) <= 64,
38            domain.count("_") == 0,
39            any(ch.isalpha() for ch in local),
40            email[0] not in ["@", "."],
41            email[-1] not in ["@", "."],
42            email[email.find("@") + 1] != ".",
43            not email.startswith("www."),
44            len(local) >= 2,
45            len(domain) > 3,
46            not all(ch.isnumeric() for ch in domain.replace(".", "")),
47            not all(ch.isnumeric() for ch in local.replace(".", "")),
48            all(email[i - 1] != "." for i, ch in enumerate(email) if ch == "."),
49            all(
50                ext not in domain
51                for ext in [
52                    ".png",
53                    ".jpg",
54                    ".js",
55                    ".html",
56                    ".svg",
57                    ".jpeg",
58                    ".mp4",
59                    ".mpeg",
60                    ".css",
61                    ".pdf",
62                    ".wav",
63                    ".docx",
64                    ".txt",
65                    ".rtf",
66                    ".gif",
67                    ".webp",
68                    ".x.x",
69                ]
70            ),
71        ]
72    ):
73        return True
74    else:
75        return False

Checks string to see if it's likely an email address.

Returns True or False.

Some emails violating some of these rules may technically be valid, but are practically never seen in use out in the wild.

def find_last_valid_character_offset(text: str) -> int:
 78def find_last_valid_character_offset(text: str) -> int:
 79    """Iterates through a string to find the index of the last valid character,
 80    assuming that string either starts or ends with '@'.
 81
 82    If the string doesn't start or end with '@', an Exception is raised.
 83
 84    Returns the number of valid characters between '@' and first invalid character.
 85    e.g. '@abcde%' will return 5 and '#123@' will return 3.
 86
 87    If no invalid characters are found, the function will return
 88    'len(text)-1'."""
 89
 90    """ Technically some of these characters are valid in an email string,
 91    but the ratio of how often they're used to how often they produce
 92    false positives makes them worth disregarding. """
 93    invalid_characters = " <>[]{},\"':;\\/#$%^&*()=+`?|\n\t\r"
 94    if text[-1] == "@" and text[0] != "@":
 95        # reverse the string
 96        text = text[::-1]
 97    elif text[0] != "@":
 98        raise ValueError(
 99            'First or last character of text arg needs to be "@"\n',
100            f"Argument {text} is invalid.",
101        )
102    i = 1
103    while i < len(text):
104        if text[i] in invalid_characters or text[i] not in printable:
105            return i - 1
106        else:
107            i += 1
108    return len(text) - 1

Iterates through a string to find the index of the last valid character, assuming that string either starts or ends with '@'.

If the string neither starts nor ends with '@', a ValueError is raised.

Returns the number of valid characters between '@' and first invalid character. e.g. '@abcde%' will return 5 and '#123@' will return 3.

If no invalid characters are found, the function will return 'len(text)-1'.

def strip_unicode(emails: list[str]) -> list[str]:
def strip_unicode(emails: list[str]) -> list[str]:
    """Remove escaped-unicode residue picked up at the front of addresses.

    Scraped text often contains literal escape sequences such as '\\u003e'
    ('>') or '\\u00a0' (non-breaking space) directly before an address, which
    leaves 'u003e' / 'u00a0' glued to the front of the extracted email.

    Everything up to and including the marker is dropped. The cut is made at
    the marker's actual position, so the address is intact whether or not a
    backslash (or anything else) precedes the marker.

    Args:
        emails: Extracted email strings.

    Returns:
        A new list, in the same order, with the markers stripped. The input
        list is not modified.
    """
    stripped_emails = []
    for email in emails:
        for marker in ("u003e", "u00a0"):
            position = email.find(marker)
            if position != -1:
                email = email[position + len(marker) :]
        stripped_emails.append(email)
    return stripped_emails

Removes unicode text that often gets picked up at the front of email addresses and returns the list.

def scrape_emails(text: str) -> list[str]:
def scrape_emails(text: str) -> list[str]:
    """Extract potential emails from ``text``.

    The text is percent-decoded (only when it contains '%') and newlines,
    tabs and carriage returns are replaced by spaces. Then, for every '@',
    the candidate is grown outwards from the '@' until an invalid character
    is hit on each side, trailing non-alphabetical characters are trimmed,
    and the result is kept if ``validate`` accepts it.

    Args:
        text: Arbitrary text (e.g. page source) to search.

    Returns:
        A sorted list of unique, lower-cased addresses after
        ``strip_unicode`` has been applied. Empty if nothing was found.
    """
    if "%" in text:
        # decode percent encoding
        text = unquote(text)
    for ch in ("\n", "\t", "\r"):
        text = text.replace(ch, " ")
    emails = []
    last_stopdex = 0
    for _ in range(text.count("@")):
        atdex = text.find("@", last_stopdex)
        next_atdex = text.find("@", atdex + 1)
        try:
            # Only look at the text between the previous candidate and the
            # next '@', so each candidate contains exactly one '@'.
            chunk = (
                text[last_stopdex:next_atdex]
                if next_atdex != -1
                else text[last_stopdex:]
            )
            chunk_atdex = chunk.find("@")
            startdex = find_last_valid_character_offset(chunk[: chunk_atdex + 1])
            stopdex = find_last_valid_character_offset(chunk[chunk_atdex:])
            email = chunk[chunk_atdex - startdex : stopdex + chunk_atdex + 1]
            # Trim trailing digits/punctuation; an IndexError here means the
            # candidate was trimmed down to nothing.
            while email[-1].isnumeric() or not email[-1].isalpha():
                email = email[:-1]
            if validate(email):
                emails.append(email.lower())
            # The extra '+ 1' is to ensure last_stopdex increments
            # if 'len(email.split('@')[1])' is 0.
            last_stopdex = atdex + len(email.split("@")[1]) + 1
        except (IndexError, ValueError):
            # Malformed candidate (empty piece, '@' trimmed away, ...):
            # skip this '@' and carry on with the next one.
            last_stopdex = atdex + 1
    return sorted(set(strip_unicode(emails)))

Extracts potential emails from given text and returns as a list of strings.