-
Notifications
You must be signed in to change notification settings - Fork 9
/
metadata_utils.py
77 lines (61 loc) · 2.62 KB
/
metadata_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json, re, bs4
import domain_utils, web_utils, publicsuffix2
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
def get_page_text(source):
    """Return the visible text of an HTML document, one fragment per line.

    Args:
        source: HTML markup to parse (handed to ``bs4.BeautifulSoup`` with
            the ``html.parser`` backend).

    Returns:
        A string in which every non-empty text fragment sits on its own
        line. ``<script>`` and ``<style>`` contents are excluded.
    """
    soup = bs4.BeautifulSoup(source, 'html.parser')

    # Script and style bodies are code rather than page copy; remove them
    # from the tree so get_text() does not return them.
    for tag in soup(["script", "style"]):
        tag.decompose()

    text = soup.get_text()

    # Trim leading/trailing whitespace on every line.
    lines = (line.strip() for line in text.splitlines())

    # Break multi-headlines into a line each. The delimiter is a run of TWO
    # spaces: splitting on a single space would put every word on its own
    # line and destroy phrases (and space-separated phone numbers) that
    # callers search for in the result.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

    # Drop the blank fragments produced by the stripping/splitting above.
    return '\n'.join(chunk for chunk in chunks if chunk)
def get_page_metadata(url, *, max_redirects=20):
    """Load ``url`` in headless Chrome and collect metadata about the page.

    The URL is loaded repeatedly, following whatever address the browser
    ends up on, until an address repeats (or ``max_redirects`` loads have
    been made). The final page is then scanned for e-mail addresses, phone
    numbers and links, and the ``domain_utils`` availability checks are run.

    Args:
        url: Address to start from.
        max_redirects: Upper bound on page loads while following the
            redirect chain, so a site that produces a fresh URL on every
            load cannot keep the loop running forever.

    Returns:
        A dict with the keys ``target``, ``redirect_chain``,
        ``redirects_to_target``, ``emails``, ``phone_numbers``, ``urls``,
        ``available_whois``, ``available_godaddy`` and
        ``available_privately``. ``emails`` and ``phone_numbers`` are
        de-duplicated, sorted lists of regex matches; the ``available_*``
        values are whatever the corresponding ``domain_utils`` helper
        returns.
    """
    redirect_chain = [url]
    current_url = url

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument("--window-size=1280,720")
    options.add_argument('--disable-gpu')
    # NOTE(review): the driver version is pinned here; newer webdriver_manager
    # releases appear to name this argument differently - confirm against the
    # installed version before upgrading.
    service = Service(ChromeDriverManager(version='114.0.5735.90').install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # 1. Get redirection path: reload until the browser reports a URL
        #    that is already in the chain.
        for _ in range(max_redirects):
            driver.get(current_url)
            current_url = driver.current_url
            if current_url in redirect_chain:
                break
            redirect_chain.append(current_url)

        # 1a. Read the last page once; the same markup feeds both the text
        #     extraction and the link extraction below.
        source = driver.page_source
        title = driver.title
    finally:
        # Always shut the browser down, even if a page load failed, so no
        # Chrome/chromedriver process is left behind.
        driver.quit()

    # Keep the title on its own line so its last token cannot fuse with the
    # first token of the body text and produce bogus regex matches.
    page_text = title + '\n' + get_page_text(source)

    # 2. Get emails on page
    email_regex = r'''([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'''
    emails = sorted(set(re.findall(email_regex, page_text)))

    # 3. Get phone numbers on page
    phone_regex = r'''(?:\+?\d[-() \d]*){8,12}'''
    phone_numbers = sorted(set(re.findall(phone_regex, page_text)))

    # 4. Get urls on page
    urls = web_utils.get_links(source)

    # 5. Purchasable?
    whois = domain_utils.is_buyable_whois(url)
    godaddy = domain_utils.is_buyable_godaddy(url)
    private = domain_utils.is_buyable_privately(page_text)

    # Best-effort check: does the final URL's domain contain the target's
    # domain? Any failure in the domain helpers is treated as "no", but
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        redirects_to_target = (
            domain_utils.get_domain_from_url(url)
            in domain_utils.get_domain_from_url(redirect_chain[-1])
        )
    except Exception:
        redirects_to_target = False

    return {
        "target": url,
        "redirect_chain": redirect_chain,
        "redirects_to_target": redirects_to_target,
        "emails": emails,
        "phone_numbers": phone_numbers,
        "urls": urls,
        "available_whois": whois,
        "available_godaddy": godaddy,
        "available_privately": private,
    }