-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrape_sd_county_daily_summary.py
107 lines (82 loc) · 3.46 KB
/
scrape_sd_county_daily_summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
'''
Downloads the daily COVID-19 city-of-residence pdf from the SD County website
and converts its table to csv and json output
Website: https://www.sandiegocounty.gov/content/dam/sdc/hhsa/programs/phs/Epidemiology/COVID-19%20Daily%20Update_City%20of%20Residence.pdf
'''
import requests
from datetime import date, timedelta
import tabula
import pandas as pd
import re
import json
#For filesystem checks when writing output
import os
#Download PDF locally
def download_pdf(filename,URL):
    '''
    Download the file at URL and save it locally as filename.

    filename: path of the local file to write (opened in binary mode)
    URL: address passed to requests.get

    Raises requests.HTTPError when the server answers with an error status,
    so an error page is never saved in place of the pdf, and
    requests.Timeout when the server does not answer within the timeout.
    '''
    #Without a timeout requests waits forever on an unresponsive server
    myfile = requests.get(URL, timeout=60)
    myfile.raise_for_status()
    #Context manager closes the file even if the write fails
    with open(filename,'wb') as pdf_out:
        pdf_out.write(myfile.content)
#Read pdf file and convert its rows to a dataframe and json
def convert_pdf(filename,yesterdate):
    '''
    Read the city table from the pdf and return it as a dataframe and as a
    json-ready dict.

    filename: path of the pdf handed to tabula.read_pdf
    yesterdate: label stored in the "date" column and used as the single
                key of the json dict

    Returns (df, json_data). df has the columns date, city, confirmed_cases
    and percent_of_total(%). json_data maps yesterdate to a list holding one
    dict per city with the same city, confirmed_cases and percent_of_total(%)
    values as the matching dataframe row.
    '''
    columns = ["date","city","confirmed_cases","percent_of_total(%)"]
    pdf_df = tabula.read_pdf(filename, pages = "all", multiple_tables = False)
    #NOTE(review): newer tabula-py releases appear to return a list of
    #dataframes even with multiple_tables = False - confirm. Both return
    #shapes are accepted here.
    if(isinstance(pdf_df, list)):
        if(pdf_df):
            pdf_df = pd.concat(pdf_df, ignore_index = True)
        else:
            pdf_df = pd.DataFrame()
    pdf_list = pdf_df.values.tolist()
    #Determine where data index begins and ends
    start_index = 0
    end_index = 0
    for i, plist in enumerate(pdf_list):
        #str() keeps an empty (NaN) first cell from breaking .lower()
        first_cell = str(plist[0]).lower()
        if('incorporated city' in first_cell):
            start_index = i + 1
        elif('total san diego' in first_cell):
            #i + 1 keeps the county total row inside the slice below
            end_index = i + 1
            break
    #Collect the rows first and build the dataframe once: DataFrame.append
    #copied the whole frame per row and no longer exists in pandas 2.x
    rows = []
    for line in pdf_list[start_index:end_index]:
        parsed = _split_city_row(line)
        if(parsed is None):
            continue
        city, cases, percent = parsed
        rows.append({"date":yesterdate,
                     "city":city,
                     "confirmed_cases":cases,
                     "percent_of_total(%)":percent})
    df = pd.DataFrame(rows, columns=columns)
    #The json records carry the same values as the dataframe rows, minus date
    json_data = {}
    json_data[yesterdate] = [{"city":row["city"],
                              "confirmed_cases":row["confirmed_cases"],
                              "percent_of_total(%)":row["percent_of_total(%)"]}
                             for row in rows]
    return df, json_data


def _split_city_row(line):
    '''
    Split one table row into (city, confirmed_cases, percent), or return None
    for rows without a city (header rows and rows with an empty first cell).
    '''
    #Empty cells are not strings, so such a row has no city to record
    if(not isinstance(line[0], str)):
        return None
    if(len(line) == 1):
        #Whole row landed in one cell: split it around the runs of digits,
        #giving [city, cases, separator, percent pieces..., trailing text]
        #NOTE(review): a count written with a thousands separator (1,234)
        #would be cut at the comma by this split - confirm against the pdf
        tmp_list = re.split(r'(\d+)',line[0].lower())
        if('incorporated' in tmp_list[0]):
            return None
        return tmp_list[0], tmp_list[1], "".join(tmp_list[3:-1])
    if('incorporated' in line[0]):
        return None
    #Second cell holds "<cases> <percent>" separated by a single space
    counts = line[1].split(" ")
    return line[0], counts[0], counts[1]
if __name__=="__main__":
    #The pdf and its records are labelled with the day before the run
    yesterdate = str(date.today() - timedelta(days=1))
    pdf_name = "sd_daily_update_city_" + yesterdate + ".pdf"
    filepath = "sd_daily_city_pdfs/"
    filename = filepath + pdf_name
    URL = "https://www.sandiegocounty.gov/content/dam/sdc/hhsa/programs/" + \
          "phs/Epidemiology/COVID-19%20Daily%20Update_City%20of%20Residence.pdf"
    #Downloading and converting data
    #Create the pdf folder on the first run so the download can be saved
    os.makedirs(filepath, exist_ok=True)
    download_pdf(filename,URL)
    df, json_data = convert_pdf(filename, yesterdate)
    #Writing data
    csv_file = 'sd_daily_city_summary.csv'
    json_file = 'sd_daily_city_summary.json'
    #Append to the csv, writing the header only when the file is new
    csv_is_new = not os.path.exists(csv_file)
    csv_mode = 'w' if csv_is_new else 'a'
    df.to_csv(csv_file,mode=csv_mode,header=csv_is_new,\
              index=False)
    #Appending one json object per run leaves a file json.load cannot read,
    #so merge the new date into the stored records and rewrite the file
    json_records = {}
    if(os.path.exists(json_file)):
        with open(json_file,'r') as json_in:
            existing = json_in.read()
        #raw_decode reads one object at a time, which also recovers files
        #written by earlier runs as back-to-back objects
        decoder = json.JSONDecoder()
        pos = 0
        while(pos < len(existing)):
            if(existing[pos].isspace()):
                pos += 1
                continue
            record, pos = decoder.raw_decode(existing, pos)
            json_records.update(record)
    json_records.update(json_data)
    with open(json_file,'w') as json_out:
        json.dump(json_records,json_out)