-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_unicodedata.py
106 lines (83 loc) · 3.67 KB
/
read_unicodedata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from dataclasses import dataclass
from enum import IntFlag, auto
import typing
class FontVariantType(IntFlag):
    """Bit flags naming the typographic styles a font-variant character can have.

    Every style occupies its own bit, so several styles can be combined with
    ``|`` (for example ``BOLD | ITALIC``). ``NONE`` is the empty combination.
    """

    NONE = 0
    # One bit per style, assigned in declaration order.
    BOLD = 1 << 0
    DOUBLE_STRUCK = 1 << 1
    FRAKTUR = 1 << 2
    ITALIC = 1 << 3
    MATHEMATICAL = 1 << 4
    MONOSPACE = 1 << 5
    SANS_SERIF = 1 << 6
    SCRIPT = 1 << 7
@dataclass
class CharacterFontVariant:
    """A styled rendition of a base character together with its style flags."""

    # The variant character itself (not the base character it derives from).
    text: str
    # Combination of FontVariantType flags describing the variant's style.
    kind: FontVariantType
class ParsedUnicodeData:
    """Container for the lookup tables built from the Unicode data file.

    All three tables start out empty and are keyed by a base character.
    """

    def __init__(self) -> None:
        # base character -> its subscript form
        self.subscript_mapping: dict[str, str] = dict()
        # base character -> its superscript form
        self.superscript_mapping: dict[str, str] = dict()
        # base character -> every styled variant recorded for it
        self.font_variants: dict[str, list[CharacterFontVariant]] = dict()
def _font_kind_from_name(name: str) -> FontVariantType:
    """Return the style flags implied by keywords found in a character name."""
    keyword_flags = (
        (("MATHEMATICAL",), FontVariantType.MATHEMATICAL),
        (("BOLD",), FontVariantType.BOLD),
        (("DOUBLE-STRUCK",), FontVariantType.DOUBLE_STRUCK),
        (("FRAKTUR", "BLACK-LETTER"), FontVariantType.FRAKTUR),
        (("ITALIC",), FontVariantType.ITALIC),
        (("MONOSPACE",), FontVariantType.MONOSPACE),
        (("SANS-SERIF",), FontVariantType.SANS_SERIF),
        (("SCRIPT",), FontVariantType.SCRIPT),
    )
    # OR real flag members together so the result stays a FontVariantType;
    # multiplying a flag by a bool would silently degrade it to a plain int.
    kind = FontVariantType.NONE
    for keywords, flag in keyword_flags:
        if any(keyword in name for keyword in keywords):
            kind |= flag
    return kind


def read_datafile() -> ParsedUnicodeData:
    """Parse ``./UnicodeData.txt`` into superscript, subscript and font tables.

    The file is opened relative to the current working directory and read as
    one semicolon-separated record of 15 fields per line. Blank lines are
    skipped. Only records whose decomposition field has the shape
    ``<tag> XXXX`` (a tag followed by exactly one hexadecimal code point) are
    used:

    * ``<super>``: ``superscript_mapping[base] = character``
    * ``<sub>``: ``subscript_mapping[base] = character``
    * ``<font>``: a ``CharacterFontVariant`` is appended to
      ``font_variants[base]``, with style flags derived from the character
      name.

    A few entries that the data file does not provide in the needed form are
    added by hand after parsing.

    Returns:
        A freshly populated ``ParsedUnicodeData``.

    Raises:
        OSError: If ``./UnicodeData.txt`` cannot be opened.
        ValueError: If a record does not have exactly 15 fields, or a code
            point that has to be converted is not valid hexadecimal.
    """
    result = ParsedUnicodeData()

    # TODO: Fix finding superscript alpha, iota, epsilon
    # Their fallbacks are listed as the "Latin" variants, meaning they aren't found
    # when looking for ^{\alpha} as it looks for the Greek variants
    with open("./UnicodeData.txt", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            fields = line.split(";")
            if len(fields) != 15:
                raise ValueError(
                    f"UnicodeData.txt line {line_number}: "
                    f"expected 15 fields, got {len(fields)}"
                )

            codepoint, name, decomposition = fields[0], fields[1], fields[5]

            # We aren't looking for 2 -> 1 mappings: anything that is not
            # exactly "<tag> XXXX" (untagged, or several base characters)
            # is skipped.
            parts = decomposition.split()
            if len(parts) != 2:
                continue
            map_type, base_hex = parts
            if map_type not in ("<super>", "<sub>", "<font>"):
                continue

            char = chr(int(codepoint, 16))
            basechar = chr(int(base_hex, 16))

            if map_type == "<super>":
                # Intentionally overwrite if there's multiple
                # Later unicode values tend to look more consistent with one another
                result.superscript_mapping[basechar] = char
            elif map_type == "<sub>":
                # Intentionally overwrite if there's multiple
                result.subscript_mapping[basechar] = char
            else:  # "<font>"
                variant = CharacterFontVariant(char, _font_kind_from_name(name))
                result.font_variants.setdefault(basechar, []).append(variant)

    # HACK: Fix up some missing mappings
    result.superscript_mapping["α"] = "ᵅ"
    result.superscript_mapping["ϵ"] = "ᵋ"
    result.superscript_mapping["ι"] = "ᶥ"
    result.superscript_mapping["ϕ"] = "ᶲ"

    # Planck's constant already fulfills this role, but isn't detected because it was added before
    # the Unicode standard had a group of mathematical variants
    # setdefault keeps this from raising KeyError when the data file contained
    # no font variant for "h".
    result.font_variants.setdefault("h", []).append(
        CharacterFontVariant(
            text="\u210E",
            kind=FontVariantType.ITALIC | FontVariantType.MATHEMATICAL,
        )
    )

    return result