-
Notifications
You must be signed in to change notification settings - Fork 3
/
1-parse-renewals.py
37 lines (31 loc) · 1.11 KB
/
1-parse-renewals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# This script converts each copyright renewal record from tab-separated
# (TSV) files to a newline-delimited JSON format similar to (but much
# simpler than) that created by 0-parse-registrations.py.
import json
from pdb import set_trace
import os
from csv import DictReader
from collections import defaultdict
from model import Renewal
class Parser(object):
    """Turn tab-separated renewal files into ``Renewal`` objects.

    ``count`` holds the number of records yielded so far by
    ``process_directory_tree``.
    """

    def __init__(self):
        # Running total of records yielded by process_directory_tree().
        self.count = 0
        # Not read or written by any method in this class; kept so the
        # attribute remains available to any external user.
        self.cross_references = defaultdict(list)

    def process_directory_tree(self, path):
        """Yield one parsed record for every row of every data file in `path`.

        Only the immediate contents of `path` are examined (no recursion).
        Files whose names do not end in ``tsv`` are ignored, as is
        ``TOC.tsv``. Files are visited in sorted name order so that the
        output is reproducible; ``os.listdir`` alone returns entries in
        arbitrary order.

        :param path: Directory containing the tab-separated input files.
        :yield: Whatever ``process_file`` yields for each row.
        """
        for filename in sorted(os.listdir(path)):
            if not filename.endswith('tsv') or filename == 'TOC.tsv':
                continue
            for entry in self.process_file(os.path.join(path, filename)):
                yield entry
                self.count += 1
            # NOTE(review): the original indentation of this print was lost;
            # it is assumed to report the running total once per finished
            # file -- confirm against the upstream script.
            print(self.count)

    def process_file(self, path):
        """Yield ``Renewal.from_dict(row)`` for each row of the file at `path`.

        The file is read with ``csv.DictReader`` using the ``excel-tab``
        dialect, so each row is a dict keyed by the file's header line.

        :param path: Path to a single tab-separated file.
        """
        # newline='' lets the csv module do its own line-ending handling;
        # the with-block closes the handle once the generator is exhausted
        # or closed, instead of leaking one open file per input.
        with open(path, newline='') as handle:
            for row in DictReader(handle, dialect='excel-tab'):
                yield Renewal.from_dict(row)
# Write every parsed renewal as one JSON document per line (NDJSON).
# The with-block guarantees the output file is flushed and closed even if
# parsing raises part-way through.
parser = Parser()
with open("output/1-parsed-renewals.ndjson", "w") as output:
    for parsed in parser.process_directory_tree("renewals/data"):
        json.dump(parsed.jsonable(), output)
        output.write("\n")