-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrape_sdcounty_status.py
106 lines (89 loc) · 2.77 KB
/
scrape_sdcounty_status.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import requests,html5lib,json
import boto3,datetime,sys,re
# San Diego County status page that the scraper reads by default.
URL = "https://www.sandiegocounty.gov/content/sdc/hhsa/programs/phs/community_epidemiology/dc/2019-nCoV/status.html"

# Date stamp (YYYY-MM-DD) taken from UTC shifted by a fixed seven hours (PDT).
# It is evaluated once, when the module is imported, so a long-lived process
# keeps the import-time value.
# NOTE(review): the fixed offset does not follow the switch to PST (UTC-8) --
# confirm whether the one-hour window around midnight matters here.
date_str = (
    datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(hours=7)
).strftime("%Y-%m-%d")
def get_county_html(url=None, timeout=30):
    """Download a status page and parse it into an element tree.

    Args:
        url: Address of the page to fetch. When falsy, the module-level
            ``URL`` is used instead.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The document produced by ``html5lib.parse`` with
        ``namespaceHTMLElements=False``.

    Raises:
        requests.exceptions.RequestException: The request failed, timed out,
            or the server answered with a 4xx/5xx status.
    """
    resp = requests.get(url or URL, timeout=timeout)
    # Surface HTTP errors here rather than handing an error page to the parser.
    resp.raise_for_status()
    return html5lib.parse(resp.content, namespaceHTMLElements=False)
def _cell_text(cell):
    """Return the text of the first ``<b>`` inside *cell*, else the cell's own text."""
    bold = cell.find('.//b')
    if bold is not None and bold.text:
        return bold.text
    return cell.text


def parse_out_table(doc, updated=None):
    """Turn the first ``div/table`` found in *doc* into a flat dictionary.

    Each ``<tr>`` contributes one entry: the text of its first ``<td>`` is the
    key and the text of its second ``<td>`` is the value. Rows with exactly
    four cells take their value from the fourth cell instead. Rows with fewer
    than two cells, or with an empty second cell, are ignored.

    Values are converted with ``int()`` when possible; anything ``int()``
    rejects (free text, numbers with thousands separators, ...) is kept as a
    string.

    Args:
        doc: Parsed document exposing the ElementTree ``find``/``findall`` API.
        updated: Value stored under the ``"updated"`` key. When ``None`` the
            current date at a fixed UTC-7 offset is computed at call time, so
            a long-lived process does not keep stamping an import-time date.

    Returns:
        dict mapping row labels to ``int`` or ``str`` values, plus
        ``"updated"``.

    Raises:
        IndexError: *doc* contains no ``<table>`` that is a direct child of a
            ``<div>``.
    """
    tables = doc.findall('.//div/table')
    if not tables:
        raise IndexError("no <table> directly inside a <div> was found")

    result = {}
    for tr in tables[0].findall('.//tr'):
        cells = [_cell_text(td) for td in tr.findall('.//td')]
        if len(cells) <= 1:
            continue
        # A cell holding only child elements has no text of its own (None);
        # treat it like an empty cell instead of crashing on .strip().
        value = (cells[1] or "").strip()
        if not value:
            continue
        if len(cells) == 4:
            value = (cells[3] or "").strip()
        try:
            value = int(value)
        except ValueError:
            pass  # not a plain integer: keep the text as-is
        result[cells[0]] = value

    if updated is None:
        pacific_now = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(hours=7)
        updated = pacific_now.strftime("%Y-%m-%d")
    result["updated"] = updated
    return result
if __name__ == "__main__":
    # Command-line use: an optional first argument replaces the default page
    # address; the parsed table is printed to stdout as indented JSON.
    url = sys.argv[1] if len(sys.argv) > 1 else None
    print(json.dumps(parse_out_table(get_county_html(url)), indent=1))
def lambda_handler(event, context):
    """Scrape the county status page and publish the results to S3.

    Three public-read JSON objects are written to the ``opensandiego-data``
    bucket, in this order:

    * ``data/corona-sd/data.json`` -- every stored dated snapshot followed by
      the freshly scraped one;
    * ``data/corona-sd/coronavirus-latest.json`` -- the fresh snapshot;
    * ``data/corona-sd/<date>-corona-sd.json`` -- the fresh snapshot under its
      date.

    Args:
        event: Not used.
        context: Not used.
    """
    s3_client = boto3.client('s3')
    bucket = 'opensandiego-data'
    prefix = 'data/corona-sd/'
    path = 'data/corona-sd/coronavirus-latest.json'
    dated_path = 'data/corona-sd/%s-corona-sd.json'

    # Get Data
    htmldoc = get_county_html()
    data = parse_out_table(htmldoc)
    # Name the dated snapshot after the date stored in the payload so the key
    # and the "updated" field cannot disagree.
    todays_key = dated_path % data["updated"]

    # Generate listing with date stamp
    regex = re.compile(r'data/corona-sd/(\d+)-(\d+)-(\d+)-corona-sd\.json')
    listing = []
    # Paginate: a single list_objects_v2 call returns at most 1000 keys, and a
    # page has no 'Contents' entry when nothing matches the prefix.
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            key = obj['Key']
            # Skip today's snapshot from an earlier run; the fresh data is
            # appended below and would otherwise appear twice in the feed.
            if key == todays_key or not regex.fullmatch(key):
                continue
            body = s3_client.get_object(Bucket=bucket, Key=key)['Body'].read()
            listing.append(json.loads(body))
    listing.append(data)

    # Update the full data feed first, then store the latest and the dated
    # snapshot for record keeping.
    uploads = (
        ('data/corona-sd/data.json', listing),
        (path, data),
        (todays_key, data),
    )
    for key, payload in uploads:
        s3_client.put_object(
            Body=json.dumps(payload, indent=1),
            Bucket=bucket,
            Key=key,
            ContentType="application/json",
            ACL='public-read',
        )