-
Notifications
You must be signed in to change notification settings - Fork 1
/
unjson
executable file
·36 lines (32 loc) · 1.08 KB
/
unjson
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env python3
#
# Unpacks a JSON (or newline-delimited JSON) file exported from a BigQuery httparchive table.
import json
import os
import sys
def write(data, outpath):
    """Write one exported record to *outpath* as UTF-8 text.

    The record's ``body`` is written first, followed by a trailing
    ``<!-- ... -->`` block that records the ``page`` and ``url`` values.

    Args:
        data: Mapping providing the ``'page'``, ``'url'`` and ``'body'`` keys.
        outpath: Path of the text file to create (or overwrite).

    Raises:
        KeyError: If one of the three keys is missing; raised before the
            output file is opened.
    """
    # Look up every field before touching the filesystem.
    page, url, body = data['page'], data['url'], data['body']
    trailer_fields = (
        ('page: %s\n', page),
        ('url: %s\n', url),
    )
    with open(outpath, mode='wt', encoding='utf-8') as out:
        out.write(body)
        out.write('\n<!--\n')
        for template, value in trailer_fields:
            out.write(template % value)
        out.write('-->\n')
if __name__ == '__main__':
    # Unpack every file named on the command line. A file holding a single
    # JSON document becomes "<base>.txt"; a file holding one JSON document
    # per line becomes "<base>.<NN>.txt", numbered from 1 and zero-padded.
    for path in sys.argv[1:]:
        outbase = os.path.splitext(path)[0]
        # The context manager closes the input even if parsing fails.
        with open(path, mode='rt', encoding='utf-8') as f:
            try:
                # first try to parse the whole file as JSON
                data = json.load(f)
            except json.JSONDecodeError:
                # otherwise try to split by lines; blank lines (such as a
                # trailing empty line) carry no record and are skipped
                f.seek(0)
                linedata = [json.loads(line) for line in f if line.strip()]
            else:
                linedata = None
        # Writing happens outside the try block so that an error raised by
        # write() is reported as-is instead of triggering the line fallback.
        if linedata is None:
            write(data, outbase + '.txt')
        else:
            width = len(str(len(linedata)))
            for index, record in enumerate(linedata, start=1):
                number = str(index).rjust(width, '0')
                write(record, outbase + '.' + number + '.txt')