-
Notifications
You must be signed in to change notification settings - Fork 31
/
readme2book.py
248 lines (217 loc) · 8.22 KB
/
readme2book.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# 1. ingest readme
# 2. Iterate over sections. We will use these to create folder paths
# 3. Iterate over entries. We will create a document for each entry using the stub template.
# It'd be neat and fancy to create this as a jinja template, but that's like way overkill. At least for now.
# - Create a TOC section for each top level bullet. Nest sub-docs beneath this. Some housekeeping may be necessary
# - Create a MyST document for each reference. Let's use this pattern: YYYY_FirstAuthorFullName_arxivIDorDOIifAvailable.
# ....
# Actually, that pattern doesn't work since not everything is on arxiv or even has a link.
# Let's go with this instead: YYYY_FirstAuthorFullName_FirstKCharactersOfArticleTitle
#
#
# - In retrospect, if there's a way I can convert the collection I have into bibtex entries, that would be a better
# data structure to parse for this. Oh well. Later. Bibtex + jinja. Noted.
from loguru import logger
from collections import defaultdict
from unidecode import unidecode
import string
from pathlib import Path
class Book:
    """Turn the parsed README into one stub document per reference.

    The README is parsed by ``ReadmeParser`` at construction time; calling
    ``generate_stubs`` then writes a ``Stub`` file for every parsed item
    into ``outpath``.
    """

    def __init__(self,
                 readme_fpath='README.md',
                 outpath='content',
                 toc_path='_toc.md',
                 ):
        """Parse ``readme_fpath`` and remember where output should go.

        Args:
            readme_fpath: Path of the README handed to ``ReadmeParser``.
            outpath: Directory the stub documents are written into.
            toc_path: Path of the table-of-contents file. It is only stored
                here; no code in this class writes it yet.
        """
        self.outpath = outpath
        # Previously this argument was accepted and then dropped.
        self.toc_path = toc_path
        # Defined up front so the attribute exists before generate_stubs().
        self.stubs = []
        self.parser = ReadmeParser(fpath=readme_fpath)

    def generate_stubs(self):
        """Create and write a ``Stub`` for every item under every subheading.

        Entries without a ``'subheadings'`` key are skipped. Each created
        stub is collected in ``self.stubs`` (reset on every call) before it
        is written to ``self.outpath``.
        """
        # Stub.write() opens files directly inside outpath, so the directory
        # has to exist first; create it instead of failing on a clean tree.
        Path(self.outpath).mkdir(parents=True, exist_ok=True)

        self.stubs = []
        for topic in self.parser.entries:
            if 'subheadings' not in topic:
                continue
            for entries in topic['subheadings'].values():
                for entry in entries:
                    stub = Stub(entry, depth=3)
                    self.stubs.append(stub)
                    # write stub to disk
                    stub.write(self.outpath)
                    # TODO: map stub to TOC entry (or write TOC entry?)

    def add_stubs_to_toc(self):
        """Placeholder: adding stubs to the TOC is not implemented yet."""
        pass
class Stub:
    """Render one parsed README reference as a MyST-markdown stub document.

    ``item`` is a mapping. The code in this class reads ``item['title_part']``
    (heading text), ``item['stub_name']`` (output file name without
    extension) and, when present, ``item['url']``.
    """

    # Front matter placed at the top of every stub. The settings must be
    # nested under ``jupytext`` / ``text_representation`` / ``kernelspec``;
    # written flat they are unrelated top-level YAML keys.
    _header = """---
jupytext:
  formats: md:myst
  text_representation:
    extension: .md
    format_name: myst
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---"""

    # ``str.format`` template for a code cell that embeds a PDF through
    # panel. The doubled braces render as a literal ``{code-cell}``.
    _pdf_embed_template = """```{{code-cell}} ipython3
:tags: [hide-input]
import panel as pn
pn.extension()
pdf_pane = pn.pane.PDF('{pdf_url}', width=700, height=1000)
pdf_pane
```"""

    def __init__(self, item, depth=1):
        """
        Args:
            item: Mapping describing the reference (see class docstring).
            depth: Markdown heading level used for the title (number of ``#``).
        """
        self.item = item
        self.depth = depth

    @property
    def title(self):
        """Markdown heading line: ``depth`` hashes followed by the title."""
        hashes = '#' * self.depth
        return f"{hashes} {self.item['title_part']}"

    @property
    def pdf_embed(self):
        """Code-cell markup embedding ``item['url']`` as a PDF pane.

        Raises:
            KeyError: if the item has no ``'url'``.
        """
        return self._pdf_embed_template.format(pdf_url=self.item['url'])

    def __str__(self):
        """Full document text: header, title and (if present) the URL.

        The parts are separated by blank lines.
        """
        parts = [self._header, self.title]
        url = self.item.get('url')
        # TODO: when item.get('is_pdf') is set, emit self.pdf_embed instead of
        # the bare URL (this was disabled in the original code).
        if url:
            parts.append(url)
        return '\n\n'.join(parts)

    def write(self, fpath='.', ext='.md'):
        """Write the rendered stub to ``<fpath>/<stub_name><ext>``.

        Args:
            fpath: Directory to write into; it must already exist.
            ext: File extension, including the leading dot.
        """
        outpath = Path(fpath) / Path(self.item['stub_name'] + ext)
        # The README is decoded as UTF-8, so titles may contain non-ASCII
        # characters; write UTF-8 explicitly instead of relying on the
        # platform default encoding.
        with open(outpath, 'w', encoding='utf-8') as f:
            f.write(str(self))
class ReadmeParser:
    """Parse the README into sections, subheadings and reference records.

    Format handled by the code below:

    * a line starting with ``'# '`` opens a new section; the first section
      found is skipped by ``parse_entries``;
    * inside a section, a line starting with ``'* '`` is a subheading;
    * every other non-empty line under a subheading is an item of the form
      ``<bullet> YYYY - [Title](url) - Author One, Author Two``.

    After construction ``entries`` holds the parsed result and ``errors``
    the raw item lines that could not be parsed.
    """

    def __init__(self, fpath='README.md'):
        """Read and parse ``fpath`` immediately."""
        self.fpath = fpath
        self.read()
        self.entries = self.parse()

    def read(self):
        """Load the README as UTF-8 text into ``self.text``."""
        with open(self.fpath, 'rb') as f:
            raw = f.read()
        self.text = raw.decode("utf-8")

    def parse(self):
        """Run the whole pipeline and return the list of section entries."""
        self.errors = []
        entries = self.parse_sections()
        entries = self.parse_entries(entries)
        entries = self.parse_subheadings(entries)
        entries = self.parse_items(entries)
        return entries

    def parse_sections(self):
        """Split ``self.text`` into lists of lines, one list per section.

        A new section starts at every line beginning with ``'# '``. Lines
        before the first such heading form a section of their own.
        """
        sections = []
        section = []
        for line in self.text.split('\n'):
            if line.startswith('# ') and section:
                sections.append(section)
                section = []
            section.append(line)
        # Flush the section still being collected; without this the last
        # section of the README was silently dropped.
        if section:
            sections.append(section)
        return sections

    def parse_entries(self, sections):
        """Turn sections into ``{'heading': ..., 'rest': [...]}`` dicts.

        The first section is skipped. ``heading`` is the first line without
        its leading ``'# '``; ``rest`` is the list of remaining lines.
        """
        entries = []
        for sec in sections[1:]:
            entries.append({'heading': sec[0][2:].strip(), 'rest': sec[1:]})
        return entries

    def parse_subheadings(self, entries):
        """Group each entry's lines under their subheading.

        ``entry['rest']`` is removed. If at least one item line was found,
        ``entry['subheadings']`` maps subheading text to the list of stripped
        item lines. Lines that appear before the first subheading of a
        section are ignored.
        """
        for entry in entries:
            # Reset per section: a subheading must not carry over from the
            # previous section into this one.
            subheading = ""
            subheadings = defaultdict(list)
            for line in entry.pop('rest'):
                if line.startswith('* '):
                    subheading = line.strip()[2:]
                    continue
                if not subheading:
                    continue
                line = line.strip()
                if line:
                    subheadings[subheading].append(line)
            if subheadings:
                entry['subheadings'] = subheadings
        return entries

    def parse_year(self, item):
        """Start a record from an item line: extract the leading year.

        Returns:
            dict with ``'year'`` (int) and ``'rest'`` (the remaining
            ``'-'``-separated pieces, as a list).

        Raises:
            ValueError: if the text before the first ``'-'`` (minus its first
                character, the bullet) is not an integer.
        """
        parts = item.split('-')
        year_part, rest = parts[0], parts[1:]
        # [1:] drops the leading bullet character before the year.
        return {'rest': rest, 'year': int(year_part[1:].strip())}

    def parse_title(self, rec):
        """Extract ``title_part`` from the ``[Title](url`` portion.

        ``rec['rest']`` is re-joined with ``'-'`` (undoing the split made in
        ``parse_year``, so hyphens inside titles and URLs survive) and then
        replaced by the text following ``']('``.

        Raises:
            ValueError: unless the text contains exactly one ``']('``.
        """
        parts = '-'.join(rec['rest']).split('](')
        if len(parts) != 2:
            raise ValueError(
                f"expected exactly one '](' in item, found {len(parts) - 1}"
            )
        title_part, rec['rest'] = parts
        # remove starting bracket. need to standardize/remove quotes
        rec['title_part'] = title_part.strip()[1:]
        return rec

    def parse_url_and_authors(self, rec):
        """Split the remainder into ``url`` and the ``authors`` list.

        Sets ``rec['is_pdf']`` when the URL ends in ``.pdf`` and removes
        ``rec['rest']``.

        Raises:
            ValueError: unless the text contains exactly one ``') -'``.
        """
        parts = rec['rest'].split(') -')
        if len(parts) != 2:
            raise ValueError(
                f"expected exactly one ') -' in item, found {len(parts) - 1}"
            )
        rec['url'], authors_part = parts
        rec.pop('rest')
        if rec['url'].endswith('.pdf'):
            # NOTE: stored as the string 'True' (not a bool); kept as-is so
            # existing consumers of this field see the same value.
            rec['is_pdf'] = 'True'
        rec['authors'] = [a.strip() for a in authors_part.split(',')]
        return rec

    def parse_items(self, entries):
        """Replace the raw item lines of every subheading by parsed records.

        Items that fail to parse are logged, collected in ``self.errors`` and
        left out of the result, so a subheading's list may end up empty.
        Entries without ``'subheadings'`` are left untouched.
        """
        errors = self.errors
        for entry in entries:
            if 'subheadings' not in entry:
                continue  # actually I want to remove these, but deal with it later
            for topic, items in entry['subheadings'].items():
                recs = []
                for item in items:
                    try:
                        rec = self.parse_year(item)
                        rec = self.parse_title(rec)
                        rec = self.parse_url_and_authors(rec)
                        rec['stub_name'] = self.generate_fname(rec)
                    except ValueError:
                        logger.warning(item)
                        errors.append(item)
                        # Skip the append below: otherwise a stale or
                        # half-built record would be stored for this item.
                        continue
                    recs.append(rec)
                # Always replace the raw lines so consumers only ever see
                # parsed records, never unparsed strings.
                entry['subheadings'][topic] = recs
        return entries

    def generate_fname(self, entry, k_title_chars=30):
        """YYYY_FirstAuthorFullName_FirstKCharactersOfArticleTitle"""
        # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
        strip_punct = str.maketrans('', '', string.punctuation)

        def clean(text):
            s = text.title().replace(' ', '').translate(strip_punct)
            # Transliteration can itself produce ASCII punctuation (e.g. from
            # typographic quotes or dashes), so strip once more afterwards to
            # keep such characters out of the file name.
            return unidecode(s).translate(strip_punct)

        author_clean = clean(entry['authors'][0])
        title_clean = clean(entry['title_part'])
        return f"{entry['year']}_{author_clean}_{title_clean[:k_title_chars]}"
# to do:
# - generate stubs from items
# - generate TOC entries
if __name__ == '__main__':
    # Script entry point: parse ./README.md and write one stub per item.
    parser = ReadmeParser()
    stub_path = 'anthology_of_modern_ml/stubs'
    # Stub.write() opens files directly inside stub_path; make sure the
    # directory exists so a clean checkout does not fail on every item.
    Path(stub_path).mkdir(parents=True, exist_ok=True)
    for entry in parser.entries:
        if 'subheadings' not in entry:
            continue
        for subheading, items in entry['subheadings'].items():
            logger.debug(f'subheading: {subheading}')
            for item in items:
                logger.debug(f'item:\n {item}')
                stub = Stub(item)
                # Best effort: one item that cannot be written is logged and
                # must not stop the remaining stubs from being generated.
                try:
                    stub.write(stub_path)
                except Exception as e:
                    logger.warning(e)