scrapetools.email_scraper
import re
from string import printable
from urllib.parse import unquote


def validate(email: str) -> bool:
    """Checks string to see if it's likely an email address.

    Returns True or False.

    Some emails violating some of these rules
    may technically be valid, but are practically
    never seen in use out in the wild."""
    if email.count("@") != 1 or email.count(".") == 0:
        return False
    atdex = email.find("@")
    last_dot = email.rfind(".")
    local, domain = email.split("@")
    # RULES:
    # '@' comes before the last '.'
    # local part is 64 characters or less
    # domain part doesn't contain any '_'
    # at least 1 character in local is alphabetical
    # 1st character is not '@' or '.'
    # last character is not '@' or '.'
    # character after '@' is not '.'
    # doesn't start with 'www.'
    # local is two or more characters
    # domain is more than 3 characters
    # domain doesn't consist of only numbers
    # local doesn't consist of only numbers
    # no consecutive '.' in email
    # email doesn't contain a listed file ext
    if all(
        [
            atdex < last_dot,
            len(local) <= 64,
            domain.count("_") == 0,
            any(ch.isalpha() for ch in local),
            email[0] not in ["@", "."],
            email[-1] not in ["@", "."],
            email[email.find("@") + 1] != ".",
            not email.startswith("www."),
            len(local) >= 2,
            len(domain) > 3,
            not all(ch.isnumeric() for ch in domain.replace(".", "")),
            not all(ch.isnumeric() for ch in local.replace(".", "")),
            all(email[i - 1] != "." for i, ch in enumerate(email) if ch == "."),
            all(
                ext not in domain
                for ext in [
                    ".png",
                    ".jpg",
                    ".js",
                    ".html",
                    ".svg",
                    ".jpeg",
                    ".mp4",
                    ".mpeg",
                    ".css",
                    ".pdf",
                    ".wav",
                    ".docx",
                    ".txt",
                    ".rtf",
                    ".gif",
                    ".webp",
                    ".x.x",
                ]
            ),
        ]
    ):
        return True
    else:
        return False


def find_last_valid_character_offset(text: str) -> int:
    """Iterates through a string to find the index of the last valid character,
    assuming that string either starts or ends with '@'.

    If the string doesn't start or end with '@', an Exception is raised.

    Returns the number of valid characters between '@' and first invalid character.
    e.g. '@abcde%' will return 5 and '#123@' will return 3.

    If no invalid characters are found, the function will return
    'len(text)-1'."""

    """ Technically some of these characters are valid in an email string,
    but the ratio of how often they're used to how often they produce
    false positives makes them worth disregarding. """
    invalid_characters = " <>[]{},\"':;\\/#$%^&*()=+`?|\n\t\r"
    if text[-1] == "@" and text[0] != "@":
        # reverse the string
        text = text[::-1]
    elif text[0] != "@":
        raise ValueError(
            'First or last character of text arg needs to be "@"\n',
            f"Argument {text} is invalid.",
        )
    i = 1
    while i < len(text):
        if text[i] in invalid_characters or text[i] not in printable:
            return i - 1
        else:
            i += 1
    return len(text) - 1


def strip_unicode(emails: list[str]) -> list[str]:
    """Removes unicode text that often gets picked
    up at the front of email addresses and returns the list."""
    stripped_emails = []
    for email in emails:
        for text in ["u003e", "u00a0"]:
            if text in email:
                email = email[len(text) :]
        stripped_emails.append(email)
    return stripped_emails


def scrape_emails_noregex(text: str) -> list[str]:
    """Extracts potential emails from given text
    and returns as a list of strings."""
    if "%" in text:
        # decode percent encoding
        text = unquote(text)
    for ch in ["\n", "\t", "\r"]:
        text = text.replace(ch, " ")
    at_count = text.count("@")
    emails = []
    if at_count > 0:
        last_stopdex = 0
        for i in range(at_count):
            atdex = text.find("@", last_stopdex)
            next_atdex = text.find("@", atdex + 1)
            try:
                chunk = (
                    text[last_stopdex:next_atdex]
                    if next_atdex != -1
                    else text[last_stopdex:]
                )
                chunk_atdex = chunk.find("@")
                startdex = find_last_valid_character_offset(chunk[: chunk_atdex + 1])
                stopdex = find_last_valid_character_offset(chunk[chunk_atdex:])
                email = chunk[chunk_atdex - startdex : stopdex + chunk_atdex + 1]
                while email[-1].isnumeric() or not email[-1].isalpha():
                    email = email[:-1]
                if validate(email):
                    emails.append(email.lower())
                """ The extra '+ 1' is to ensure last_stopdex increments
                if 'len(email.split('@')[1])' is 0."""
                last_stopdex = atdex + len(email.split("@")[1]) + 1
            except Exception as e:
                last_stopdex = atdex + 1
    emails = sorted(list(set(strip_unicode(emails))))
    return emails


def filter_out_files(
    emails: list[str], additional_extensions: list[str] = None
) -> list[str]:
    """Filter out emails with file extensions
    instead of domains.

    :param additional_extensions: Extra file extensions to filter out."""
    ext = [
        "png",
        "jpg",
        "js",
        "html",
        "svg",
        "jpeg",
        "mp4",
        "mpeg",
        "css",
        "pdf",
        "wav",
        "docx",
        "txt",
        "rtf",
        "gif",
        "webp",
        "x",
    ]
    if additional_extensions:
        ext.extend([extension.strip(".") for extension in additional_extensions])
    ignore = "$|".join(ext) + "$"
    pattern = r".*[.](?!" + ignore + r")[^.]*$"
    # Lazy evaluation means we can skip the regex overhead for common domains
    return [
        email
        for email in emails
        if email[email.rfind(".") + 1 :]
        in ["com", "org", "net", "us", "io", "edu", "gov", "biz"]
        or re.search(pattern, email.lower())
    ]


def replace_unicodehex(text: str) -> str:
    """Replace unicode hex strings (u003e etc.) with a space."""
    return re.sub(r"u00[a-zA-Z0-9]{2}", " ", text)


def scrape_emails(text: str, extra_extensions: list[str] = None) -> list[str]:
    """Extract emails from text using regex.

    :param text: The text to scrape.

    :param extra_extensions: Extra file extensions to filter out."""
    # Remove chunks with no "@" in them to reduce processing
    text = unquote(" ".join(chunk.lower() for chunk in text.split() if "@" in chunk))

    # Replace any % encoding or unicode hex strings with spaces
    text = replace_unicodehex(text)

    # Validation:
    # Starts with an alphanumeric character.
    # Local part consists of 1-63 alphanumeric + '._-' characters.
    # Contains a single '@' character not at the beginning or end of a string.
    # Domain consists of one or more alphanumeric + '_-' characters
    # followed by a '.' and one or more alphanumeric + '._-' characters
    # and ending in an alphabetical character.
    pattern = (
        r"[a-zA-Z0-9]{1}[a-zA-Z0-9._-]{1,63}@[a-zA-Z0-9_-]+\.[a-zA-Z0-9._-]+[a-zA-Z]{1}"
    )

    # Match pattern but throw out duplicates and anything that has
    # only numbers in the local part of the address.
    emails = [
        email.lower()
        for email in set(re.findall(pattern, text))
        if not email.split("@")[0].isnumeric()
    ]
    # Remove anything that looks like a file and sort the final results
    return sorted(filter_out_files(emails))
def validate(email: str) -> bool:
Checks string to see if it's likely an email address.
Returns True or False.
Some emails violating some of these rules may technically be valid, but are practically never seen in use out in the wild.
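A minimal usage sketch; the addresses below are hypothetical examples chosen to trigger the rules above, not taken from the library:

from scrapetools.email_scraper import validate

print(validate("jane.doe@example.com"))  # True: passes every rule listed above
print(validate("a@b.co"))                # False: local part must be at least two characters
print(validate("logo@image.png"))        # False: domain contains a listed file extension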
def find_last_valid_character_offset(text: str) -> int:
Iterates through a string to find the index of the last valid character, assuming that string either starts or ends with '@'.
If the string doesn't start or end with '@', an Exception is raised.
Returns the number of valid characters between '@' and first invalid character. e.g. '@abcde%' will return 5 and '#123@' will return 3.
If no invalid characters are found, the function will return 'len(text)-1'.
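A short sketch using the docstring's own examples plus the error case; the final string is a made-up invalid input:

from scrapetools.email_scraper import find_last_valid_character_offset

print(find_last_valid_character_offset("@abcde%"))  # 5: 'abcde' are the valid characters after '@'
print(find_last_valid_character_offset("#123@"))    # 3: the string is reversed, then '1', '2', '3' are counted
# find_last_valid_character_offset("no-at-sign")    # raises ValueError: text must start or end with '@'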
def strip_unicode(emails: list[str]) -> list[str]:
Removes unicode text that often gets picked up at the front of email addresses and returns the list.
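A small illustration, assuming the escape text sits at the front of the scraped address; the addresses are hypothetical:

from scrapetools.email_scraper import strip_unicode

emails = ["u003ejane@example.com", "u00a0bob@example.org", "clean@example.net"]
print(strip_unicode(emails))
# ['jane@example.com', 'bob@example.org', 'clean@example.net']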
def scrape_emails_noregex(text: str) -> list[str]:
Extracts potential emails from given text and returns as a list of strings.
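A minimal sketch of the regex-free scraper on hypothetical page text; results come back lowercased, deduplicated, and sorted:

from scrapetools.email_scraper import scrape_emails_noregex

text = "Contact us at support@example.com or sales@example.org for help."
print(scrape_emails_noregex(text))
# ['sales@example.org', 'support@example.com']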
def filter_out_files(emails: list[str], additional_extensions: list[str] = None) -> list[str]:
Filter out emails with file extensions instead of domains.
Parameters
- additional_extensions: Extra file extensions to filter out.
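A short sketch of the filter; the addresses and the extra "xml" extension are hypothetical:

from scrapetools.email_scraper import filter_out_files

emails = ["jane@example.com", "icon@sprite.png", "info@example.co.uk", "feed@site.xml"]
print(filter_out_files(emails, additional_extensions=["xml"]))
# ['jane@example.com', 'info@example.co.uk'] -- the .png and .xml "domains" are dropped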
def replace_unicodehex(text: str) -> str:
Replace unicode hex strings (u003e etc.) with a space.
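A tiny illustration on a made-up snippet; each five-character escape sequence becomes a single space:

from scrapetools.email_scraper import replace_unicodehex

print(replace_unicodehex("u003ejane@example.comu00a0"))
# ' jane@example.com '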
def scrape_emails(text: str, extra_extensions: list[str] = None) -> list[str]:
Extract emails from text using regex.
Parameters
- text: The text to scrape.
- extra_extensions: Extra file extensions to filter out.
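A compact usage sketch on hypothetical text; matches are lowercased, deduplicated, and file-like hits are filtered out before the sorted list is returned:

from scrapetools.email_scraper import scrape_emails

text = "Download logo@header.png or email Support@Example.com today"
print(scrape_emails(text))
# ['support@example.com']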