-
Notifications
You must be signed in to change notification settings - Fork 12
/
generate.py
executable file
·413 lines (348 loc) · 13.6 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
#!/usr/bin/env python3
"""Generate the widechar_width* output files from the Unicode data files and the per-language templates."""
import datetime
import hashlib
import os.path
import re
import sys
from collections.abc import Iterable
from typing import NamedTuple
from urllib.request import urlretrieve
# Unicode version; it is substituted into each data-file URL below.
VERSION = "16.0.0"
# Locations of the three Unicode data files read by this script.
UNICODE_DATA_URL = "https://unicode.org/Public/%s/ucd/UnicodeData.txt" % VERSION
EAW_URL = "https://unicode.org/Public/%s/ucd/EastAsianWidth.txt" % VERSION
EMOJI_DATA_URL = "https://unicode.org/Public/%s/ucd/emoji/emoji-data.txt" % VERSION
# Indices of a handful of the ";"-separated fields in a UnicodeData.txt line.
# See https://www.unicode.org/L2/L1999/UnicodeData.html
FIELD_CODEPOINT = 0
FIELD_NAME = 1
FIELD_CATEGORY = 2
# Category for unassigned codepoints.
# This is also the category every CodePoint starts out with.
CAT_UNASSIGNED = "Cn"
# Category for private use codepoints.
CAT_PRIVATE_USE = "Co"
# Category for surrogates.
CAT_SURROGATE = "Cs"
# Category for non-characters.
# Note this does not appear in UnicodeData.txt.
# See https://www.unicode.org/faq/private_use.html
CAT_NON_CHARACTERS = "non-characters"
# Maximum codepoint value.
MAX_CODEPOINT = 0x10FFFF
# Prefix handed to the templates as the "p" field.
CPP_PREFIX = "widechar_"
# The negative values below are special markers stored in CodePoint.width
# in place of a real width.
# Ambiguous East Asian characters
WIDTH_AMBIGUOUS_EASTASIAN = -3
# Width changed from 1 to 2 in Unicode 9.0
WIDTH_WIDENED_IN_9 = -6
# Private use characters.
WIDTH_PRIVATE_USE = -7
class CodePoint(object):  # pylint: disable=too-few-public-methods
    """Represents a single Unicode codepoint.

    Attributes:
        codepoint: the integer codepoint value.
        width: the width assigned to the codepoint, or None while no
            width has been assigned.
        category: the general category string; starts as CAT_UNASSIGNED.
    """

    # One instance is created for every codepoint up to MAX_CODEPOINT
    # (more than a million objects), so skip the per-instance __dict__.
    __slots__ = ("codepoint", "width", "category")

    def __init__(self, codepoint):
        self.codepoint = codepoint
        self.width = None
        self.category = CAT_UNASSIGNED

    def __repr__(self):
        return "CodePoint(%s, width=%r, category=%r)" % (
            self.hex(),
            self.width,
            self.category,
        )

    def hex(self):
        """Return the codepoint as a hex string, zero-padded to at least five digits."""
        return "0x%05X" % self.codepoint
# Settings controlling language output.
class LangSettings(NamedTuple):
    """Formatting options applied when rendering codepoint range tables for one output language."""

    # Open/close characters for ranges, like "{}":
    # index 0 opens each range and index 1 closes it.
    range_chars: str
    # String emitted after every line break between table entries.
    indentation: str = " "
    # When true, a "," is also emitted after the final table entry.
    keep_last: bool = False
# Data parsed from unicode.org datafiles.
# Datas are lists of lines, with comment-only lines removed.
# Hashes are sha1 strings.
class UnicodeDatas(NamedTuple):
    """Contents and SHA-1 digests of the three Unicode data files, as returned by read_datafile."""

    # Lines and digest of UnicodeData.txt.
    unicode_data: list[str]
    unicode_hash: str
    # Lines and digest of EastAsianWidth.txt.
    eaw_data: list[str]
    eaw_hash: str
    # Lines and digest of emoji-data.txt.
    emoji_data: list[str]
    emoji_hash: str
def log(msg):
    """Write msg (converted with str) and a trailing newline to stderr."""
    print(msg, file=sys.stderr)
def read_datafile(url):
    """Return (lines, sha1) for the data file named by url.

    The file is looked up in the current directory under the last path
    component of url; it is downloaded only when no such file exists yet.
    lines is the UTF-8 decoded content split on newlines, without any line
    that starts with "#". sha1 is the hex digest of the raw file bytes.
    """
    filename = url.rpartition("/")[2]
    if not os.path.isfile(filename):
        log("Downloading " + filename)
        urlretrieve(url, filename)

    with open(filename, "rb") as datafile:
        raw = datafile.read()

    digest = hashlib.sha1(raw).hexdigest()
    text = raw.decode("utf-8")
    kept = [entry for entry in text.split("\n") if not entry.startswith("#")]
    return (kept, digest)
def set_general_categories(unicode_data, cps):
    """Set the general category of codepoints from UnicodeData.txt lines.

    unicode_data is an iterable of lines; cps is indexed by codepoint value
    and each entry has its .category overwritten. Lines with too few
    ";"-separated fields to contain a category are ignored.
    """
    # NOTE(review): UnicodeData.txt is understood to describe some large
    # ranges with paired "<..., First>" / "<..., Last>" lines; only the two
    # listed endpoint codepoints receive a category here - confirm that this
    # is intended.
    for record in unicode_data:
        columns = record.strip().split(";")
        if len(columns) <= FIELD_CATEGORY:
            continue
        category = columns[FIELD_CATEGORY]
        for codepoint in hexrange_to_range(columns[FIELD_CODEPOINT]):
            cps[codepoint].category = category
def merged_codepoints(cps: Iterable[CodePoint]):
    """Group codepoints into runs of consecutive values.

    The input is sorted by codepoint value first. The result is a list of
    (start, end) pairs of the original objects, each pair describing one
    inclusive run; an empty input yields an empty list.
    """
    merged = []
    for cp in sorted(cps, key=lambda item: item.codepoint):
        if merged and cp.codepoint == merged[-1][1].codepoint + 1:
            # Directly follows the current run: extend that run's end.
            run_start = merged[-1][0]
            merged[-1] = (run_start, cp)
        else:
            merged.append((cp, cp))
    return merged
def gen_seps(length, indentation, keep_last, table_columns=1):
    """Yield the separator that follows each entry of a table with `length` entries.

    Args:
        length: number of table entries; exactly this many separators are
            yielded.
        indentation: string placed after the line break that ends a row.
        keep_last: when true the final entry is followed by ",", otherwise
            by the empty string.
        table_columns: number of entries per row. Entries inside a row are
            separated by ", "; the last entry of a row is followed by ","
            plus a newline and `indentation`. Defaults to 1, which puts
            every entry on its own line.

    Raises:
        ValueError: if table_columns is smaller than 1 (raised when the
            generator is first advanced).
    """
    if table_columns < 1:
        raise ValueError("table_columns must be at least 1")
    for idx in range(1, length + 1):
        if idx == length:
            # The final entry never starts a new row.
            yield "," if keep_last else ""
        elif idx % table_columns == 0:
            yield ",\n" + indentation
        else:
            yield ", "
def codepoints_to_carray_str(settings: LangSettings, cps: Iterable[CodePoint]):
    """Return an array-literal body listing the inclusive ranges covered by cps.

    The codepoints are merged into consecutive runs. Each run is rendered
    as its start and end in hex, wrapped in settings.range_chars, and
    followed by the separator gen_seps produces for its position. No
    codepoints yields the empty string.
    """
    ranges = merged_codepoints(cps)
    seps = gen_seps(len(ranges), settings.indentation, settings.keep_last)
    # Build all pieces and join once instead of growing a string with "+=".
    return "".join(
        "%s%s, %s%s%s"
        % (
            settings.range_chars[0],
            start.hex(),
            end.hex(),
            settings.range_chars[1],
            sep,
        )
        for (start, end), sep in zip(ranges, seps)
    )
def hexrange_to_range(hexrange):
    """Convert a hex codepoint or hex codepoint range to a range object.

    "1F300..1F320" denotes an inclusive range and yields
    range(0x1F300, 0x1F321); a lone value such as "1F321" yields a range
    containing just that codepoint.
    """
    bounds = [int(part, 16) for part in hexrange.split("..")]
    first = bounds[0]
    # A single value is both the first and the last element.
    last = first if len(bounds) == 1 else bounds[1]
    return range(first, last + 1)
def parse_eaw_line(eaw_line):
    """Return a list of (codepoint, width) tuples for one EastAsianWidth.txt line.

    Lines that do not consist of exactly two ";"-separated fields once the
    trailing "#" comment is removed produce an empty list.
    """
    # Everything from the first "#" onwards is a comment.
    content = eaw_line.partition("#")[0]
    parts = [part.strip() for part in content.strip().split(";")]
    if len(parts) != 2:
        return []
    cps, width_type = parts

    # Width types seen in the file:
    #   A: ambiguous, F: fullwidth, W: wide, H: halfwidth,
    #   N: neutral, Na: east-asian Narrow
    # Ambiguous gets a marker value, F and W are width 2, the rest width 1.
    if width_type == "A":
        width = WIDTH_AMBIGUOUS_EASTASIAN
    else:
        width = 2 if width_type in ("F", "W") else 1
    return [(cp, width) for cp in hexrange_to_range(cps)]
def set_eaw_widths(eaw_data_lines, cps):
    """Set .width on the entries of cps from EastAsianWidth.txt lines.

    After the explicit assignments, codepoints in the default-wide ranges
    that still have no width (None) are given width 2.
    """
    for eaw_line in eaw_data_lines:
        for codepoint, width in parse_eaw_line(eaw_line):
            cps[codepoint].width = width

    # Apply the following special cases:
    #  - The unassigned code points in the following blocks default to "W":
    #      CJK Unified Ideographs Extension A: U+3400..U+4DBF
    #      CJK Unified Ideographs:             U+4E00..U+9FFF
    #      CJK Compatibility Ideographs:       U+F900..U+FAFF
    #  - All undesignated code points in Planes 2 and 3, whether inside or
    #    outside of allocated blocks, default to "W":
    #      Plane 2: U+20000..U+2FFFD
    #      Plane 3: U+30000..U+3FFFD
    default_wide = (
        (0x3400, 0x4DBF),
        (0x4E00, 0x9FFF),
        (0xF900, 0xFAFF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    )
    for first, last in default_wide:
        for codepoint in range(first, last + 1):
            entry = cps[codepoint]
            if entry.width is None:
                entry.width = 2
def parse_emoji_line(line):
    """Return a list of (codepoint, version, prop) tuples for one emoji-data.txt line.

    A line without a "#" comment yields an empty list. version is the
    number following "E" at the start of the comment, or 0.0 when the
    comment does not start with such a version.
    """
    # Example line: 0023 ; Emoji # 1.1 [1] (#) number sign
    data_part, marker, comment = line.partition("#")
    if not marker:
        return []
    cps, prop = data_part.split(";")
    prop = prop.strip()
    # Some code points are marked "reserved" and do not have a version "NA".
    version_match = re.search(r"^\s*E\d+\.\d+", comment)
    if version_match:
        version = float(version_match.group(0).strip()[1:])
    else:
        version = 0.0
    return [(cp, version, prop) for cp in hexrange_to_range(cps)]
def set_emoji_widths(emoji_data_lines, cps):
    """Set .width on the entries of cps from emoji-data.txt lines.

    Only codepoints with the Emoji_Presentation property are touched, and
    the codepoints from 0x1F1E6 up to (not including) 0x1F200 are always
    skipped.
    """
    for emoji_line in emoji_data_lines:
        for codepoint, version, prop in parse_emoji_line(emoji_line):
            # The Regional Indicators are special
            if 0x1F1E6 <= codepoint < 0x1F200:
                continue
            # We only care about emoji *presentation*.
            # Other codepoints should be rendered as text by default,
            # so their EAW width applies.
            if prop != "Emoji_Presentation":
                continue
            # If this emoji was introduced before Unicode 9, then it was widened in 9.
            # The version we get here is the *Emoji* version.
            # Before Unicode 11 this was different, Unicode 9 shipped with Emoji 3.0.
            if version >= 3.0:
                cps[codepoint].width = 2
            else:
                cps[codepoint].width = WIDTH_WIDENED_IN_9
def set_hardcoded_ranges(cps):
    """Mark private use, surrogate and non-character codepoints.

    Overwrites .category on the entries of cps (indexed by codepoint
    value) for a fixed set of inclusive ranges.
    """

    def assign(ranges, category):
        """Give every codepoint in the inclusive ranges the category."""
        for first, last in ranges:
            for idx in range(first, last + 1):
                cps[idx].category = category

    # Private use can be determined awkwardly from UnicodeData.txt,
    # but we just hard-code them.
    # We do not treat "private use high surrogate" as private use
    # so as to match wcwidth9().
    assign(
        [(0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD)],
        CAT_PRIVATE_USE,
    )

    assign([(0xD800, 0xDBFF), (0xDC00, 0xDFFF)], CAT_SURROGATE)

    # See "noncharacters" discussion at https://www.unicode.org/faq/private_use.html
    # "Last two code points of each of the 16 supplementary planes" and also BMP (plane 0).
    nonchar_ranges = [(0xFDD0, 0xFDEF)]
    nonchar_ranges.extend(
        (plane_base + 0xFFFE, plane_base + 0xFFFF)
        for plane_base in range(0, 0x110000, 0x10000)
    )
    assign(nonchar_ranges, CAT_NON_CHARACTERS)
def read_datas():
    """Load UnicodeData.txt, EastAsianWidth.txt and emoji-data.txt into a UnicodeDatas."""
    # Read in the same order as the UnicodeDatas fields are declared.
    sources = (UNICODE_DATA_URL, EAW_URL, EMOJI_DATA_URL)
    (
        (unicode_data, unicode_hash),
        (eaw_data, eaw_hash),
        (emoji_data, emoji_hash),
    ) = [read_datafile(url) for url in sources]
    return UnicodeDatas(
        unicode_data=unicode_data,
        unicode_hash=unicode_hash,
        eaw_data=eaw_data,
        eaw_hash=eaw_hash,
        emoji_data=emoji_data,
        emoji_hash=emoji_hash,
    )
def make_codepoints(datas: UnicodeDatas):
    """Build the list of CodePoints for 0..MAX_CODEPOINT from a UnicodeDatas.

    The list is indexed by codepoint value. Categories and widths are
    filled in from the data files, after which the hard-coded category
    ranges are applied.
    """
    cps = list(map(CodePoint, range(MAX_CODEPOINT + 1)))
    # Later steps overwrite values set by earlier ones, so order matters.
    set_general_categories(datas.unicode_data, cps)
    set_eaw_widths(datas.eaw_data, cps)
    set_emoji_widths(datas.emoji_data, cps)
    set_hardcoded_ranges(cps)
    return cps
def make_fields(
    datas: UnicodeDatas,
    cps: list[CodePoint],
    settings: LangSettings,
    template_hash: str,
    generate_hash: str,
    filename,
):
    """Return the dictionary of values to substitute into a template string.

    Table entries are array strings rendered with settings; the hashes,
    filename and version are passed through unchanged.
    """
    log("Thinking...")

    def render(selected):
        """Render the selected codepoints as a carray string."""
        return codepoints_to_carray_str(settings, selected)

    def categories(cats):
        """Return a carray string of codepoints in any of the given categories."""
        wanted = set(cats)
        return render([cp for cp in cps if cp.category in wanted])

    def codepoints_with_width(width):
        """Return a carray string of codepoints with the given width."""
        return render(cp for cp in cps if cp.width == width)

    # A carray string of ASCII codepoints.
    ascii_codepoints = render(cp for cp in cps if 0x20 <= cp.codepoint < 0x7F)

    # A decomposed Hangul syllable is a grapheme that consists of up to three
    # code points. The first code point has width 2. The rest consists of
    # Jamo vowels and/or a trailing consonant, both of which have width 1.
    # This means that clients who naively sum individual characters'
    # wcwidth(), will compute string widths different from the intended width
    # (2). Work around this by forcing width 0 for these characters. This
    # matches glibc and others.
    combiningletters = render(
        cp
        for cp in cps
        if 0x1160 <= cp.codepoint <= 0x11FF or 0xD7B0 <= cp.codepoint <= 0xD7FF
    )

    return {
        "p": CPP_PREFIX,
        "filename": filename,
        "unicode_version": VERSION,
        "generate_hash": generate_hash,
        "template_hash": template_hash,
        "unicode_hash": datas.unicode_hash,
        "eaw_hash": datas.eaw_hash,
        "emoji_hash": datas.emoji_hash,
        "ascii": ascii_codepoints,
        "private": categories([CAT_PRIVATE_USE]),
        "noncharacters": categories([CAT_NON_CHARACTERS]),
        "nonprint": categories(["Cc", "Cf", "Zl", "Zp", CAT_SURROGATE]),
        "combining": categories(["Mn", "Mc", "Me"]),
        "combiningletters": combiningletters,
        "doublewide": codepoints_with_width(2),
        "unassigned": categories([CAT_UNASSIGNED]),
        "ambiguous": codepoints_with_width(WIDTH_AMBIGUOUS_EASTASIAN),
        "widenedin9": codepoints_with_width(WIDTH_WIDENED_IN_9),
    }
def gitobjecthash(data):
    """Return the git blob object hash of data, like `git hash-object`.

    data is a bytes object. The digest is the SHA-1 hex string of a
    "blob <length>" header, a NUL byte, and then the data itself.
    """
    header = b"blob %u\0" % len(data)
    return hashlib.sha1(header + data).hexdigest()
if __name__ == "__main__":
    # The git object hash of this script is exposed to the templates as
    # "generate_hash".
    with open(__file__, "rb") as script_file:
        script_bytes = script_file.read()
    generate_hash = gitobjecthash(script_bytes)

    datas = read_datas()
    cps = make_codepoints(datas)

    # Each value is the positional argument tuple for LangSettings.
    langs = {
        # Suffix: Braces, indentation, keep the last comma
        ".h": ("{}",),
        "_c.h": ("{}",),
        ".js": ("[]",),
        ".py": ("()", " " * 2, True),
        ".rs": ("()",),
        ".java": ("{}", " " * 2),
    }
    for suffix, settings in langs.items():
        # Read the template as UTF-8 explicitly: its hash is computed over the
        # UTF-8 encoding, so decoding must not depend on the locale.
        with open("templates/template" + suffix, encoding="utf-8") as templatefile:
            template = templatefile.read()
        template_hash = gitobjecthash(template.encode("utf-8"))

        output = "widechar_width" + suffix
        fields = make_fields(
            datas,
            cps,
            LangSettings(*settings),
            template_hash,
            generate_hash,
            output,
        )
        # Write with the same encoding the template was read in.
        with open(output, "w", encoding="utf-8") as fd:
            fd.write(template.strip().format(**fields))
            fd.write("\n")
        log("Output " + output)