# Webscraper-R01
# 0. Help Section ############################################################################
""" Info: This code scrapes marketscreener.com for the press releases of each
    ticker listed in 'inputfilename' and writes the results to
    scraped_data_<timestamp>.xlsx, one sheet per ticker.
    Prerequisites for this code to work:
    1. Python 3
    2. selenium (pip install selenium)
    3. BeautifulSoup with the lxml parser (pip install beautifulsoup4 lxml)
    4. pandas and openpyxl (pip install pandas openpyxl, or via Anaconda)
    5. geckodriver (the script drives Firefox; download it from
       https://github.com/mozilla/geckodriver/releases and point
       executable_path below at it)
    Author: Daniel Enriquez
    Release History:
    00 - 08/2020
"""
############################################################################
#1. Import Section ############################################################
############################################################################
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from re import search
timestamp = datetime.now().strftime("%Y%m%d-%H%M")  # e.g. 20200902-1112; used in the output filename
############################################################################
#2. User Inputs ############################################################
############################################################################
inputfilename = 'predictions/tickers_only_20200902-1112.csv'  # csv of stocks to watch, with one header row
outfilename = 'predictions/scraped_data_' + timestamp + '.xlsx'  # output filename
############################################################################
#3. WebDriver Setup & Input CSV ############################################
############################################################################
weblinkmain = "https://www.marketscreener.com/" # do not change this
# best to put geckodriver in a folder that is on your PATH. I put mine in 'Scripts'
capa = DesiredCapabilities.FIREFOX
capa["pageLoadStrategy"] = "none"  # return control before the page finishes loading; we stop it manually
driver = webdriver.Firefox(executable_path=r"C:\Users\J63434\Desktop\Personal Projects\surpriver-master\geckodriver.exe",
                           desired_capabilities=capa)
writer = pd.ExcelWriter(outfilename)
stockstoscrape = open(inputfilename, newline='')
headers = next(stockstoscrape, None)  # skip the header row
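# Optional sketch, not part of the original flow: geckodriver can also drive
# Firefox headlessly so no browser window opens (standard Selenium 3 options,
# but this variant is untested here):
#   from selenium.webdriver.firefox.options import Options
#   opts = Options()
#   opts.headless = True
#   driver = webdriver.Firefox(executable_path=r"...path to geckodriver...",
#                              options=opts, desired_capabilities=capa)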
############################################################################
#4. Define Functions #######################################################
############################################################################
def waitwebdriver(driver, string):
    """Wait up to 15 s for a partial link text to appear; refresh once and retry on failure."""
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, string)))  # wait for page to load
    except Exception:
        driver.refresh()
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, string)))  # one retry after a refresh
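# Illustrative usage: waitwebdriver(driver, 'More Instruments') blocks until a
# link whose text contains 'More Instruments' is present, refreshing the page
# once on a 15 s timeout before giving up.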
############################################################################
#5. Main Scraper ###########################################################
############################################################################
for cnt, symbol in enumerate(stockstoscrape):  # search for each stock in csv stockstoscrape
    if len(symbol) > 3:
        comma = search(',', symbol)
        if comma:
            symbol = symbol[0:comma.start()]  # keep only the ticker before the first comma
        symbol = symbol.strip()
        # initialization of the lists collected into a df
        presstitles = []  # PR titles
        presslinks = []   # hyperlinks to the PRs
        dates = []        # PR dates
        driver.get(weblinkmain)  # navigate to landing page
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'inputrecherche')))
        driver.execute_script("window.stop();")  # stop loading once the element is found
        srch_box = driver.find_element_by_class_name('inputrecherche')  # find search box, specific to this website
        print('Searching for ' + symbol)
        srch_box.send_keys(symbol)  # search for the symbol
        srch_box.send_keys(Keys.RETURN)
        waitwebdriver(driver, 'More Instruments')
        linkstring = "//a[contains(@href, 'quote/stock')]"
        firstlink = driver.find_element_by_xpath(linkstring)  # first link matching linkstring
        driver.get(firstlink.get_attribute('href'))
        waitwebdriver(driver, 'More about the company')
        soup = BeautifulSoup(driver.page_source, 'lxml')
        links = soup.findAll('td', attrs={'class': "newsColCT ptop3 pbottom3 pleft5"})  # all rows of the PR table
        for link in links:
            hyperlink = link.find('a')['href']  # hyperlink to the PR
            texts = link.text  # PR title text
            date = link.find('a')['title']  # string containing the date
            date = date.replace('News Stock market from', ' ')  # clean up string
            date = search('[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9]', date)  # extract MM/DD/YYYY
            if date is None:
                continue  # skip rows without a parsable date
            # save stuff in lists
            presslinks.append(weblinkmain + hyperlink)
            presstitles.append(texts)
            dates.append(date.group(0))
        # end links loop
        # write one sheet per ticker
        df = pd.DataFrame({"Date": dates, "PR Titles": presstitles, "PR Links": presslinks})
        df.to_excel(writer, sheet_name=symbol)
    else:
        pass
#end WL loop
writer.save()
stockstoscrape.close()  # close the input csv
driver.quit()  # shut down Firefox
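# Reading the results back (illustrative; the sheet names are the ticker
# symbols, and the timestamped filename below is just an example):
#   dfs = pd.read_excel('predictions/scraped_data_20200902-1112.xlsx', sheet_name=None)
#   print(dfs['AAPL'].head())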