sec_alj-scrape.py
# This script extracts the hyperlinks to the 2015 ALJ Orders
# from the SEC website (http://www.sec.gov/alj/aljorders.shtml)
# and downloads the linked PDF files.
# packages
import re
import time
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
# the SEC page listing the 2015 ALJ orders
url = "http://www.sec.gov/alj/aljorders.shtml"
# fetch the ALJ orders page
r = requests.get(url)
# parse the page source into usable data
soup = BeautifulSoup(r.content, "html.parser")
# collect every hyperlink on the page, skipping anchors without an href
links = []
for link in soup.find_all('a'):
    href = link.get('href')
    if href:
        print(href)
        links.append(href)
# check the number of links found
print(len(links))
# drop all non-.pdf links
pdf_links = []
for link in links:
    if re.search(r'\.pdf', link):
        pdf_links.append(link)
print(len(pdf_links))
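
# An equivalent shortcut for the filtering step above, assuming bs4 >= 4.7
# (whose bundled soupsieve engine adds CSS-selector support to select()),
# would be:
#   pdf_links = [a['href'] for a in soup.select('a[href$=".pdf"]')]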
# use the list of links to download the pdf files,
# pausing a second between requests so the server is not hammered
for i, link in enumerate(pdf_links, start=1):
    # resolve relative hrefs against the sec.gov base url
    file_link = urljoin(url, link)
    print(file_link)
    res = requests.get(file_link)
    with open("link%d.pdf" % i, 'wb') as pdf:
        pdf.write(res.content)
    time.sleep(1)
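
# A minimal sketch of a more robust download step; the function name and the
# User-Agent string are illustrative placeholders, not part of the script
# above. It keeps the SEC's own filenames instead of link1.pdf, link2.pdf,
# ..., raises on HTTP errors rather than silently saving an error page, and
# identifies the client, as sec.gov's fair-access guidelines ask of
# automated traffic:

def download_pdf(pdf_url):
    """Download one PDF, naming it after the last segment of its URL."""
    fname = pdf_url.rsplit('/', 1)[-1]
    res = requests.get(pdf_url,
                       headers={"User-Agent": "research-scraper you@example.com"})
    res.raise_for_status()  # abort on 4xx/5xx responses
    with open(fname, 'wb') as pdf:
        pdf.write(res.content)
    return fname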