Skip to content

Commit 50f12e7

Browse files
committed
wip [skip ci]
1 parent 86bfc84 commit 50f12e7

File tree

1 file changed

+107
-33
lines changed

1 file changed

+107
-33
lines changed

gcc/rust/util/rust-unicode.cc

Lines changed: 107 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ namespace Rust {
99
typedef uint32_t codepoint_t;
1010
typedef std::vector<codepoint_t> string_t;
1111

12+
// Constants used to compose and decompose precomposed Hangul syllables
// arithmetically instead of through lookup tables.
// See `Sample Code for Hangul Algorithms` in section 3.12 of
// unicode.org/versions/Unicode15.0.0/ch03.pdf

// Code point that syllable indices are measured from.
const uint32_t S_BASE = 0xAC00;
// Code point that leading consonant (L) indices are measured from.
const uint32_t L_BASE = 0x1100;
// Code point that vowel (V) indices are measured from.
const uint32_t V_BASE = 0x1161;
// Code point that trailing consonant (T) indices are measured from.  A
// trailing index of zero (i.e. T_BASE itself) means "no trailing consonant".
const uint32_t T_BASE = 0x11A7;
// Number of leading consonants.
const uint32_t L_COUNT = 19;
// Number of vowels.
const uint32_t V_COUNT = 21;
// Number of trailing consonant slots, including the "no trailing" slot.
const uint32_t T_COUNT = 28;
// Number of syllables sharing one leading consonant.
const uint32_t N_COUNT = V_COUNT * T_COUNT;
// Total number of precomposed syllables addressed from S_BASE.
const uint32_t S_COUNT = L_COUNT * N_COUNT;
20+
1221
template <std::size_t SIZE>
1322
int64_t
1423
binary_search_ranges (
@@ -115,10 +124,26 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
115124
// Returns the canonical decomposition of S (before canonical reordering).
//
// Precomposed Hangul syllables are decomposed arithmetically into their
// leading consonant, vowel and optional trailing consonant; every other
// code point is handed to recursive_decomp_cano, which appends its
// decomposition to the output buffer.
string_t
decomp_cano (string_t s)
{
  string_t buf;
  for (codepoint_t c : s)
    {
      // s_index is unsigned: for c < S_BASE the subtraction wraps around to
      // a huge value, so the single upper-bound test below is a complete
      // range check.
      uint32_t s_index = c - S_BASE;
      if (s_index < S_COUNT)
        {
          // decompose Hangul algorithmically
          uint32_t l = L_BASE + s_index / N_COUNT;
          uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
          uint32_t t = T_BASE + s_index % T_COUNT;
          buf.push_back (l);
          buf.push_back (v);
          // t == T_BASE means the syllable has no trailing consonant.
          if (t != T_BASE)
            buf.push_back (t);
          continue;
        }

      // Current character is not Hangul
      recursive_decomp_cano (c, buf);
    }
  return buf;
}
124149

@@ -145,46 +170,95 @@ sort_cano (string_t &s)
145170
}
146171

147172
// Composes conjoining Hangul jamo in S into precomposed syllables.
//
// Adjacent pairs <L, V> are replaced by the LV syllable, and an LV syllable
// followed by a trailing consonant T is replaced by the LVT syllable.  All
// other code points are copied to the result unchanged.  Strings shorter
// than two code points are returned as they are.
string_t
compose_hangul (string_t s)
{
  if (s.size () < 2)
    return s;

  string_t buf;
  buf.reserve (s.size ());

  // `last` always mirrors buf.back (): the most recently emitted code point,
  // which is the only one the next character may combine with.
  codepoint_t last = s[0];
  buf.push_back (last);
  for (std::size_t src_pos = 1; src_pos < s.size (); src_pos++)
    {
      codepoint_t ch = s[src_pos];

      // The indices below are unsigned, so a code point below the base wraps
      // around to a huge value and fails the upper-bound check.

      // L V => LV
      uint32_t l_index = last - L_BASE;
      if (l_index < L_COUNT)
        {
          uint32_t v_index = ch - V_BASE;
          if (v_index < V_COUNT)
            {
              last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
              // Replace the L already in the buffer with the LV syllable.
              buf.back () = last;
              continue;
            }
        }

      // LV T => LVT
      uint32_t s_index = last - S_BASE;
      if (s_index < S_COUNT && (s_index % T_COUNT) == 0)
        {
          uint32_t t_index = ch - T_BASE;
          // t_index == 0 is T_BASE itself, which is not a trailing
          // consonant, so it must not be absorbed into the syllable.
          if (0 < t_index && t_index < T_COUNT)
            {
              last += t_index;
              buf.back () = last;
              continue;
            }
        }

      // ch does not combine with the previous code point: keep it as is.
      last = ch;
      buf.push_back (ch);
    }
  return buf;
}
215+
216+
// Recomposes S, which is expected to be canonically decomposed and ordered.
//
// Hangul jamo are composed arithmetically first via compose_hangul.  Then
// each code point is combined with the most recent starter (a code point
// whose combining class, as reported by lookup_cc, is 0) whenever
// lookup_recomp yields a composite for the pair and the code point is not
// blocked from that starter.  Code points that do not combine are copied to
// the result unchanged.
string_t
recomp (string_t s)
{
  // compose hangul first
  s = compose_hangul (s);

  if (s.size () < 2)
    return s;

  string_t buf;
  buf.reserve (s.size ());

  // Position in `buf` of the most recent starter, valid once has_starter is
  // set.  Leading non-starters have nothing to combine with.
  bool has_starter = false;
  std::size_t starter_pos = 0;
  // Combining class of the last code point kept after the current starter,
  // or -1 if nothing has been kept after it yet.
  int last_class = -1;

  for (codepoint_t ch : s)
    {
      int ch_class = lookup_cc (ch);

      // ch is blocked from the starter if a code point kept in between has
      // a combining class that is not lower than ch's own class.
      if (has_starter && last_class < ch_class)
        {
          tl::optional<codepoint_t> composite
            = lookup_recomp (buf[starter_pos], ch);
          if (composite.has_value ())
            {
              // ch can be composed: the composite replaces the starter in
              // place and ch itself is not emitted.
              buf[starter_pos] = composite.value ();
              continue;
            }
        }

      if (ch_class == 0)
        {
          // ch is a starter that cannot be composed; later code points
          // combine with it from now on.
          has_starter = true;
          starter_pos = buf.size ();
          last_class = -1;
        }
      else
        {
          // ch is not a starter.
          last_class = ch_class;
        }
      buf.push_back (ch);
    }
  return buf;
}

0 commit comments

Comments
 (0)