-
Notifications
You must be signed in to change notification settings - Fork 8
/
myparser.py
107 lines (93 loc) · 3.09 KB
/
myparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import string
import re
class parser:
    """Pull e-mail addresses, host names, links and people names out of raw
    search-result text.

    The instance keeps the text in ``self.results``.  The cleaning methods
    (``genericClean`` / ``urlClean``) rewrite ``self.results`` in place, so
    any extractor that cleans first (``emails``, ``hostnames``) changes what
    later extractors will see.
    """

    # (old, new) pairs applied strictly in this order by genericClean().
    # Markup tags are handled before the single characters '<', '>', '/' so
    # the tags are still intact when their turn comes.
    _GENERIC_SUBSTITUTIONS = (
        ('<em>', ''),
        ('<b>', ''),
        ('</b>', ''),
        ('</em>', ''),
        ('%2f', ' '),
        ('%3a', ' '),
        ('<strong>', ''),
        ('</strong>', ''),
        ('<w:t>', ' '),
        ('>', ' '),
        (':', ' '),
        ('=', ' '),
        ('<', ' '),
        ('/', ' '),
        ('\\', ' '),
        (';', ' '),
        ('&', ' '),
        ('%3A', ' '),
        ('%3D', ' '),
        ('%3C', ' '),
    )

    # Same idea for urlClean(); note that '/' and '\\' are left untouched here.
    _URL_SUBSTITUTIONS = (
        ('<em>', ''),
        ('</em>', ''),
        ('%2f', ' '),
        ('%3a', ' '),
        ('<', ' '),
        ('>', ' '),
        (':', ' '),
        ('=', ' '),
        (';', ' '),
        ('&', ' '),
        ('%3A', ' '),
        ('%3D', ' '),
        ('%3C', ' '),
    )

    # '-' is placed last in the local-part class so it is a literal hyphen.
    # The previous spelling '.-_' was a character *range* from '.' to '_',
    # which accepted '?', '@', '[', ']' etc. and rejected '-'.
    _EMAIL_RE = re.compile(r'[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+')
    _HREF_RE = re.compile(r'<a href="(.*?)"')
    _CITE_RE = re.compile(r'<cite>(.*?)</cite>')
    # NOTE(review): the '|' below is an unescaped alternation, so this also
    # matches a bare ' LinkedIn'.  The pattern is kept as-is; the blank names
    # that alternative produces are filtered out in people_linkedin().
    _LINKEDIN_RE = re.compile('">[a-zA-Z0-9._ -]* profiles | LinkedIn')
    # The name is captured directly so hyphens inside it survive.
    _PROFILE_RE = re.compile(r'">([a-zA-Z0-9._ -]*) - <em>Google Profile</em>')

    # Substrings that mark a link as search-engine plumbing, not a result.
    _SKIPPED_URL_MARKERS = (
        'webcache',
        'google.com',
        'search?',
        'about.html',
        'privacy.html',
        'ads/',
        'services/',
    )

    def __init__(self, results, word=""):
        """Store the text to parse.

        results -- the raw text (string) to extract data from.
        word    -- domain used by hostnames() to select matching hosts.
        """
        self.results = results
        self.word = word
        self.temp = []
        # The original assigned the Python 2 builtin ``file`` here, which is
        # an undefined name on Python 3.  The attribute is kept (as None) so
        # code that merely reads ``.file`` does not break.
        self.file = None

    def _substitute(self, pairs):
        """Apply each (old, new) literal replacement to self.results in order."""
        text = self.results
        for old, new in pairs:
            text = text.replace(old, new)
        self.results = text

    def genericClean(self):
        """Strip highlight markup and turn URL/markup punctuation into spaces.

        Modifies ``self.results`` in place; returns nothing.
        """
        self._substitute(self._GENERIC_SUBSTITUTIONS)

    def urlClean(self):
        """Lighter variant of genericClean() that keeps '/' and '\\' intact.

        Modifies ``self.results`` in place; returns nothing.
        """
        self._substitute(self._URL_SUBSTITUTIONS)

    def emails(self):
        """Return the distinct e-mail-like strings found, in first-seen order.

        Runs genericClean() first, so ``self.results`` is rewritten.
        """
        self.genericClean()
        self.temp = self._EMAIL_RE.findall(self.results)
        return self.unique()

    def fileurls(self):
        """Return distinct ``<a href="...">`` targets that look like results.

        A leading '/url?q=' redirect wrapper is removed and everything from
        the first '&' on is dropped.  Links that are '#', '/', or contain one
        of the search-engine markers are skipped.
        """
        self.temp = self._HREF_RE.findall(self.results)
        urls = []
        for href in self.unique():
            target = href.replace('/url?q=', '').split('&')[0]
            if target in ('#', '/'):
                continue
            if any(marker in target for marker in self._SKIPPED_URL_MARKERS):
                continue
            urls.append(target)
        return urls

    def people_linkedin(self):
        """Return names taken from '">NAME profiles ' snippets.

        Results are not de-duplicated.  Empty or whitespace-only names are
        dropped (the original let empty strings through).
        """
        self.temp = self._LINKEDIN_RE.findall(self.results)
        names = []
        for match in self.temp:
            name = match.replace(' LinkedIn', '')
            name = name.replace(' profiles ', '')
            name = name.replace('LinkedIn', '')
            name = name.replace('"', '')
            name = name.replace('>', '')
            if name.strip():
                names.append(name)
        return names

    def profiles(self):
        """Return names taken from '">NAME - <em>Google Profile</em>' snippets.

        Results are not de-duplicated.  Names are returned without the
        separator and without surrounding whitespace; hyphens that belong to
        the name are preserved.  Blank names are dropped.
        """
        self.temp = self._PROFILE_RE.findall(self.results)
        names = []
        for captured in self.temp:
            name = captured.strip()
            if name:
                names.append(name)
        return names

    def hostnames(self):
        """Return distinct host names ending in '.<word>', first-seen order.

        Runs genericClean() first, so ``self.results`` is rewritten.
        ``self.word`` is escaped so that dots in the domain only match dots.
        """
        self.genericClean()
        reg_hosts = re.compile(r'[a-zA-Z0-9.-]*\.' + re.escape(self.word))
        self.temp = reg_hosts.findall(self.results)
        return self.unique()

    def hostnames_all(self):
        """Return the distinct host part of every ``<cite>...</cite>`` entry.

        A leading 'scheme://' is removed, then everything from the first '/'
        and from the first ':' (port) is dropped.  Entries with no host are
        skipped.
        """
        # Start from a fresh list: appending to self.temp would mix in
        # whatever an earlier extractor left there.
        hosts = []
        for cite in self._CITE_RE.findall(self.results):
            if '://' in cite:
                cite = cite.split('://', 1)[1]
            host = cite.split('/')[0].split(':')[0]
            if host:
                hosts.append(host)
        self.temp = hosts
        return self.unique()

    def unique(self):
        """Return the items of ``self.temp`` without repeats, order preserved.

        The result is also stored in ``self.new``.
        """
        seen = set()
        self.new = []
        for item in self.temp:
            if item not in seen:
                seen.add(item)
                self.new.append(item)
        return self.new