scrapetools.phone_scraper
"""Helpers for scraping U.S. phone numbers out of text or markup."""

import re

import phonenumbers
from bs4 import BeautifulSoup


def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
    """Count the numeric characters at the start (or end) of ``text``.

    Only the first 10 characters are examined (the last 10 when ``reverse``
    is true), so the result is capped at 10.

    Args:
        text: The string to scan.
        reverse: Scan backwards from the end of ``text`` instead of forwards
            from its start.

    Returns:
        The number of consecutive characters for which ``str.isnumeric()`` is
        true, counted from the chosen end until the first non-numeric
        character or the 10-character limit.
    """
    # Slice before reversing so only the 10 characters nearest the scanned
    # end are ever copied, however long ``text`` is.
    window = text[-10:][::-1] if reverse else text[:10]
    for i, ch in enumerate(window):
        if not ch.isnumeric():
            return i
    return len(window)


def find_by_separator(text: str, separator: str) -> list[str]:
    """Detect 10-digit phone numbers laid out around ``separator``.

    Every occurrence of ``separator`` (typically one of '-.') is examined
    together with the runs of numeric characters that precede and follow it.
    These layouts are recognized:

        (xxx)xxx{separator}xxxx

        (xxx) xxx{separator}xxxx

        (xxx){separator}xxx{separator}xxxx

        xxx{separator}xxx{separator}xxxx

    Args:
        text: The text to scan.
        separator: The separator to look for. The index arithmetic assumes a
            one-character separator; an empty separator yields no results.

    Returns:
        The matched numbers as 10-character strings of numeric characters,
        in order of appearance. Duplicates are kept.
    """
    numbers: list[str] = []
    if not separator:
        return numbers

    def digits_forward(start: int, stop: int) -> int:
        # Length of the numeric run beginning at ``start``, not reaching ``stop``.
        pos = start
        while pos < stop and text[pos].isnumeric():
            pos += 1
        return pos - start

    def digits_backward(stop: int, lower: int) -> int:
        # Length of the numeric run ending just before ``stop``, not crossing ``lower``.
        pos = stop
        while pos > lower and text[pos - 1].isnumeric():
            pos -= 1
        return stop - pos

    text_len = len(text)
    floor = 0  # end of the last accepted number; look-behind never crosses it
    cursor = 0  # where the next separator search begins
    while True:
        sepdex = text.find(separator, cursor)
        if sepdex == -1:
            break
        # Default: move on to the very next separator. Skipping further ahead
        # after a miss would hide separators that belong to a real number.
        cursor = sepdex + 1
        next_sepdex = text.find(separator, sepdex + 1)

        start_offset = digits_backward(sepdex, floor)
        if next_sepdex == -1:
            # No second separator, so the two-separator layouts cannot apply.
            first_stop_offset = second_stop_offset = 0
        else:
            first_stop_offset = digits_forward(sepdex + 1, next_sepdex + 1)
            second_stop_offset = digits_forward(next_sepdex + 1, text_len)
        has_trailing_groups = first_stop_offset == 3 and second_stop_offset == 4

        span = None
        if start_offset == 3 and has_trailing_groups:
            # xxx{separator}xxx{separator}xxxx
            span = (sepdex - 3, next_sepdex + 5)
        elif (
            start_offset == 0
            and has_trailing_groups
            and sepdex >= 5  # guard: negative indices would wrap around
            and text[sepdex - 1] == ")"
            and text[sepdex - 5] == "("
        ):
            # (xxx){separator}xxx{separator}xxxx
            span = (sepdex - 5, sepdex + 9)
        elif start_offset == 3 and sepdex >= 4 and text[sepdex - 4] in (")", " "):
            # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
            span = (max(floor, sepdex - 8), sepdex + 5)
        if span is None:
            continue

        candidate = text[span[0] : span[1]]
        for ch in (separator, "(", ")", " "):
            candidate = candidate.replace(ch, "")
        if len(candidate) == 10 and all(ch.isnumeric() for ch in candidate):
            numbers.append(candidate)
            # Resume after the accepted number so its characters are not reused.
            floor = cursor = span[1]
    return numbers


def find_by_href(text: str) -> list[str]:
    """Scrape phone numbers from double-quoted ``href`` attributes.

    Every ``href="..."`` value that contains ``tel:`` or ``callto:`` is
    reduced to the numeric characters following that prefix; values that
    reduce to exactly 10 characters are collected.

    Args:
        text: The markup or text to scan.

    Returns:
        The collected 10-character strings in order of appearance.
        Duplicates are kept.
    """
    indicator = 'href="'
    prefixes = ("tel:", "callto:")
    numbers: list[str] = []
    cursor = 0
    while True:
        index = text.find(indicator, cursor)
        if index == -1:
            break
        value_start = index + len(indicator)
        value_stop = text.find('"', value_start)
        if value_stop == -1:
            # Unterminated attribute: no closing quote exists anywhere after
            # this point, so no further complete value can be read.
            break
        cursor = value_start
        value = text[value_start:value_stop]
        for prefix in prefixes:
            _, found, remainder = value.partition(prefix)
            if found:
                digits = "".join(ch for ch in remainder if ch.isnumeric())
                if len(digits) == 10:
                    numbers.append(digits)
                break
    return numbers


def scrape_phone_numbers_noregex(text: str) -> list[str]:
    """Scrape U.S. phone numbers from ``text`` without regular expressions.

    Every "+1" is removed from ``text``, candidates are gathered with
    ``find_by_separator`` (for '-' and '.') and ``find_by_href``, and only
    candidates that ``phonenumbers`` reports as valid once prefixed with
    "+1" are kept.

    Args:
        text: The text to scan.

    Returns:
        The unique valid numbers as 10-character strings, sorted.
    """
    numbers = []
    text = text.replace("+1", "")
    for separator in "-.":
        numbers.extend(find_by_separator(text, separator))
    numbers.extend(find_by_href(text))
    numbers = [
        number
        for number in numbers
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + number))
    ]
    return sorted(set(numbers))


def scrape_phone_numbers(text: str) -> list[str]:
    """Scrape phone numbers from ``text`` using a regular expression.

    Every "+1" is replaced by a space and all ASCII letters are removed
    before matching. A match is an optional "(", three digits whose first is
    2-9, an optional ")", one of space/dot/hyphen, three digits whose first
    is 2-9, one of space/dot/hyphen, and four digits. Matches are reduced to
    their digits and kept only if ``phonenumbers`` reports them as valid once
    prefixed with "+1".

    Args:
        text: The text to scan.

    Returns:
        The unique valid numbers as digit strings, sorted.
    """
    text = text.replace("+1", " ")
    text = re.sub("[a-zA-Z]", "", text)
    pattern = r"\(?[2-9]{1}[0-9]{2}\)?[ .-]{1}[2-9]{1}[0-9]{2}[ .-]{1}[0-9]{4}"
    numbers = [re.sub(r"[^0-9]", "", number) for number in re.findall(pattern, text)]
    numbers = [
        number
        for number in numbers
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + number))
    ]
    return sorted(set(numbers))
def
get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
    """Count the numeric characters at the start (or end) of ``text``.

    Only the first 10 characters are examined (the last 10 when ``reverse``
    is true), so the result is capped at 10.

    Args:
        text: The string to scan.
        reverse: Scan backwards from the end of ``text`` instead of forwards
            from its start.

    Returns:
        The number of consecutive characters for which ``str.isnumeric()`` is
        true, counted from the chosen end until the first non-numeric
        character or the 10-character limit.
    """
    # Slice before reversing so only the 10 characters nearest the scanned
    # end are ever copied, however long ``text`` is.
    window = text[-10:][::-1] if reverse else text[:10]
    for i, ch in enumerate(window):
        if not ch.isnumeric():
            return i
    return len(window)
Finds the number of consecutive numeric characters at the start of a string (or at its end when `reverse` is true).
def
find_by_separator(text: str, separator: str) -> list[str]:
def find_by_separator(text: str, separator: str) -> list[str]:
    """Detect 10-digit phone numbers laid out around ``separator``.

    Every occurrence of ``separator`` (typically one of '-.') is examined
    together with the runs of numeric characters that precede and follow it.
    These layouts are recognized:

        (xxx)xxx{separator}xxxx

        (xxx) xxx{separator}xxxx

        (xxx){separator}xxx{separator}xxxx

        xxx{separator}xxx{separator}xxxx

    Args:
        text: The text to scan.
        separator: The separator to look for. The index arithmetic assumes a
            one-character separator; an empty separator yields no results.

    Returns:
        The matched numbers as 10-character strings of numeric characters,
        in order of appearance. Duplicates are kept.
    """
    numbers: list[str] = []
    if not separator:
        return numbers

    def digits_forward(start: int, stop: int) -> int:
        # Length of the numeric run beginning at ``start``, not reaching ``stop``.
        pos = start
        while pos < stop and text[pos].isnumeric():
            pos += 1
        return pos - start

    def digits_backward(stop: int, lower: int) -> int:
        # Length of the numeric run ending just before ``stop``, not crossing ``lower``.
        pos = stop
        while pos > lower and text[pos - 1].isnumeric():
            pos -= 1
        return stop - pos

    text_len = len(text)
    floor = 0  # end of the last accepted number; look-behind never crosses it
    cursor = 0  # where the next separator search begins
    while True:
        sepdex = text.find(separator, cursor)
        if sepdex == -1:
            break
        # Default: move on to the very next separator. Skipping further ahead
        # after a miss would hide separators that belong to a real number.
        cursor = sepdex + 1
        next_sepdex = text.find(separator, sepdex + 1)

        start_offset = digits_backward(sepdex, floor)
        if next_sepdex == -1:
            # No second separator, so the two-separator layouts cannot apply.
            first_stop_offset = second_stop_offset = 0
        else:
            first_stop_offset = digits_forward(sepdex + 1, next_sepdex + 1)
            second_stop_offset = digits_forward(next_sepdex + 1, text_len)
        has_trailing_groups = first_stop_offset == 3 and second_stop_offset == 4

        span = None
        if start_offset == 3 and has_trailing_groups:
            # xxx{separator}xxx{separator}xxxx
            span = (sepdex - 3, next_sepdex + 5)
        elif (
            start_offset == 0
            and has_trailing_groups
            and sepdex >= 5  # guard: negative indices would wrap around
            and text[sepdex - 1] == ")"
            and text[sepdex - 5] == "("
        ):
            # (xxx){separator}xxx{separator}xxxx
            span = (sepdex - 5, sepdex + 9)
        elif start_offset == 3 and sepdex >= 4 and text[sepdex - 4] in (")", " "):
            # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
            span = (max(floor, sepdex - 8), sepdex + 5)
        if span is None:
            continue

        candidate = text[span[0] : span[1]]
        for ch in (separator, "(", ")", " "):
            candidate = candidate.replace(ch, "")
        if len(candidate) == 10 and all(ch.isnumeric() for ch in candidate):
            numbers.append(candidate)
            # Resume after the accepted number so its characters are not reused.
            floor = cursor = span[1]
    return numbers
Attempts to detect phone numbers according to these patterns by scanning for separators (typically '-.') and how many consecutive numbers follow or precede them:
(xxx)xxx{separator}xxxx
(xxx) xxx{separator}xxxx
(xxx){separator}xxx{separator}xxxx
xxx{separator}xxx{separator}xxxx
def
find_by_href(text: str) -> list[str]:
def find_by_href(text: str) -> list[str]:
    """Scrape phone numbers from double-quoted ``href`` attributes.

    Every ``href="..."`` value that contains ``tel:`` or ``callto:`` is
    reduced to the numeric characters following that prefix; values that
    reduce to exactly 10 characters are collected.

    Args:
        text: The markup or text to scan.

    Returns:
        The collected 10-character strings in order of appearance.
        Duplicates are kept.
    """
    indicator = 'href="'
    prefixes = ("tel:", "callto:")
    numbers: list[str] = []
    cursor = 0
    while True:
        # Search from the cursor itself so an attribute at offset 0 is found,
        # and stop as soon as no attribute remains.
        index = text.find(indicator, cursor)
        if index == -1:
            break
        value_start = index + len(indicator)
        value_stop = text.find('"', value_start)
        if value_stop == -1:
            # Unterminated attribute: no closing quote exists anywhere after
            # this point, so no further complete value can be read.
            break
        cursor = value_start
        value = text[value_start:value_stop]
        for prefix in prefixes:
            # Take the digits after the matched prefix, not after the first
            # colon in the value, which may belong to something else.
            _, found, remainder = value.partition(prefix)
            if found:
                digits = "".join(ch for ch in remainder if ch.isnumeric())
                if len(digits) == 10:
                    numbers.append(digits)
                break
    return numbers
Scrapes phone numbers by href attribute.
def
scrape_phone_numbers_noregex(text: str) -> list[str]:
def scrape_phone_numbers_noregex(text: str) -> list[str]:
    """Scrape U.S. phone numbers from ``text`` without regular expressions.

    Every "+1" is removed from ``text``, candidates are gathered with
    ``find_by_separator`` (for '-' and '.') and ``find_by_href``, and only
    candidates that ``phonenumbers`` reports as valid once prefixed with
    "+1" are kept.

    Args:
        text: The text to scan.

    Returns:
        The unique valid numbers as 10-character strings, sorted.
    """
    stripped = text.replace("+1", "")
    candidates = [
        found
        for separator in "-."
        for found in find_by_separator(stripped, separator)
    ]
    candidates += find_by_href(stripped)

    valid = set()
    for candidate in candidates:
        parsed = phonenumbers.parse("+1" + candidate)
        if phonenumbers.is_valid_number(parsed):
            valid.add(candidate)
    return sorted(valid)
Scrape for U.S. phone numbers.
def
scrape_phone_numbers(text: str) -> list[str]:
def scrape_phone_numbers(text: str) -> list[str]:
    """Scrape phone numbers from ``text`` using a regular expression.

    Every "+1" is replaced by a space and all ASCII letters are removed
    before matching. A match is an optional "(", three digits whose first is
    2-9, an optional ")", one of space/dot/hyphen, three digits whose first
    is 2-9, one of space/dot/hyphen, and four digits. Matches are reduced to
    their digits and kept only if ``phonenumbers`` reports them as valid once
    prefixed with "+1".

    Args:
        text: The text to scan.

    Returns:
        The unique valid numbers as digit strings, sorted.
    """
    pattern = r"\(?[2-9]{1}[0-9]{2}\)?[ .-]{1}[2-9]{1}[0-9]{2}[ .-]{1}[0-9]{4}"
    cleaned = re.sub("[a-zA-Z]", "", text.replace("+1", " "))

    valid = set()
    for match in re.findall(pattern, cleaned):
        digits = re.sub(r"[^0-9]", "", match)
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + digits)):
            valid.add(digits)
    return sorted(valid)
Scrape phone numbers from text using regex.