@@ -9,6 +9,15 @@ namespace Rust {
9
9
typedef uint32_t codepoint_t ;
10
10
typedef std::vector<codepoint_t > string_t ;
11
11
12
// Parameters of the arithmetic Hangul syllable composition and
// decomposition.  See "Sample Code for Hangul Algorithms" in section 3.12
// of the Unicode Standard:
// unicode.org/versions/Unicode15.0.0/ch03.pdf

// First code point of each range.
const uint32_t S_BASE = 0xAC00;
const uint32_t L_BASE = 0x1100;
const uint32_t V_BASE = 0x1161;
const uint32_t T_BASE = 0x11A7;

// Number of values each index can take.
const uint32_t L_COUNT = 19;
const uint32_t V_COUNT = 21;
const uint32_t T_COUNT = 28;

// Derived sizes: syllables per L value, and syllables in total.
const uint32_t N_COUNT = V_COUNT * T_COUNT;
const uint32_t S_COUNT = L_COUNT * N_COUNT;
20
+
12
21
template <std::size_t SIZE>
13
22
int64_t
14
23
binary_search_ranges (
@@ -115,10 +124,26 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
115
124
// Canonically decompose every code point of S and return the result.
//
// Precomposed Hangul syllables (S_BASE .. S_BASE + S_COUNT - 1) are split
// arithmetically into their L, V and optional T jamo; every other code
// point is handed to recursive_decomp_cano, which appends to the same
// output buffer.
string_t
decomp_cano (string_t s)
{
  string_t buf;
  for (codepoint_t c : s)
    {
      // The subtraction is unsigned: when C is below S_BASE it wraps around
      // to a huge value, so the single upper-bound test is a complete range
      // check and no `0 <=' comparison is needed.
      uint32_t s_index = c - S_BASE;
      if (s_index < S_COUNT)
        {
          // decompose Hangul algorithmically
          uint32_t l = L_BASE + s_index / N_COUNT;
          uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
          uint32_t t = T_BASE + s_index % T_COUNT;
          buf.push_back (l);
          buf.push_back (v);
          // T_BASE itself stands for "no trailing consonant".
          if (t != T_BASE)
            buf.push_back (t);
          continue;
        }

      // Current character is not Hangul.
      recursive_decomp_cano (c, buf);
    }
  return buf;
}
124
149
@@ -145,46 +170,95 @@ sort_cano (string_t &s)
145
170
}
146
171
147
172
// Compose conjoining Hangul jamo found in S into precomposed syllables and
// return the resulting sequence.
//
// Two arithmetic rules are applied while scanning left to right:
//   L  + V => LV   (leading consonant followed by a vowel)
//   LV + T => LVT  (LV syllable followed by a trailing consonant)
// Every code point that takes part in neither rule is copied unchanged.
// Inputs shorter than two code points are returned as they are.
string_t
compose_hangul (string_t s)
{
  if (s.size () < 2)
    return s;

  string_t buf;
  buf.reserve (s.size ());

  // Invariant: LAST is always the final element of BUF, so a successful
  // composition simply overwrites buf.back ().
  codepoint_t last = s[0];
  buf.push_back (last);
  for (std::size_t src_pos = 1; src_pos < s.size (); src_pos++)
    {
      codepoint_t ch = s[src_pos];

      // The index computations below are unsigned: a code point below the
      // base wraps around to a huge value and fails the `< COUNT' test.

      // L V => LV
      uint32_t l_index = last - L_BASE;
      if (l_index < L_COUNT)
        {
          uint32_t v_index = ch - V_BASE;
          if (v_index < V_COUNT)
            {
              last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
              // Replace the L already emitted with the LV syllable.
              buf.back () = last;
              continue;
            }
        }

      // LV T => LVT
      uint32_t s_index = last - S_BASE;
      if (s_index < S_COUNT && (s_index % T_COUNT) == 0)
        {
          uint32_t t_index = ch - T_BASE;
          // T_BASE itself (index 0) is not a trailing consonant and must
          // not be absorbed into the syllable.
          if (0 < t_index && t_index < T_COUNT)
            {
              last += t_index;
              // Replace the LV already emitted with the LVT syllable.
              buf.back () = last;
              continue;
            }
        }

      // No composition applies: keep the character as it is.
      last = ch;
      buf.push_back (last);
    }
  return buf;
}
215
+
216
+ string_t
217
+ recomp (string_t s)
218
+ {
219
+ // compose hangul first
220
+ s = compose_hangul (s);
221
+
222
+ string_t buf;
223
+ if (s.size () < 2 )
224
+ return s;
225
+
226
+ int last_class = -1 ;
227
+ // int starter_pos = 0; // Assume the first character is Starter. Correct?
228
+ // int target_pos = 1;
229
+ codepoint_t starter_ch = s[0 ];
230
+
231
+ // check if two current characters are L and V
232
+ for (unsigned int src_pos = 1 ; src_pos < s.size (); src_pos++)
233
+ {
234
+ // get current character
235
+ codepoint_t ch = s[src_pos];
236
+
237
+ int ch_class = lookup_cc (ch);
238
+ tl::optional<codepoint_t > composite = lookup_recomp (starter_ch, ch);
239
+ if (composite.has_value () && last_class < ch_class)
240
+ {
241
+ // ch can be composed
242
+ buf.push_back (composite.value ());
243
+ starter_ch = composite.value ();
244
+ }
245
+ else if (ch_class == 0 )
246
+ {
247
+ // ch is Starter and cannot be composed.
248
+ if (src_pos == 1 )
249
+ // FIXME: buggy?
250
+ buf.push_back (starter_ch);
251
+ // starter_pos = target_pos;
252
+ starter_ch = ch;
253
+ last_class = -1 ;
254
+ buf.push_back (ch);
255
+ }
256
+ else
257
+ {
258
+ // ch is not Starter.
259
+ last_class = ch_class;
260
+ buf.push_back (ch);
261
+ }
188
262
}
189
263
return buf;
190
264
}
0 commit comments