Skip to content

Commit 19c8107

Browse files
author
Zibi Braniecki
committed
Add unicode escaping in resolver
1 parent a956c47 commit 19c8107

File tree

9 files changed

+127
-74
lines changed

9 files changed

+127
-74
lines changed

fluent-bundle/benches/resolver.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ fn get_ids(res: &FluentResource) -> Vec<String> {
3939
}
4040

4141
fn resolver_bench(c: &mut Criterion) {
42-
let tests = &["simple", "menubar"];
42+
let tests = &["simple", "menubar", "unescape"];
4343
let ftl_strings = get_strings(tests);
4444

4545
c.bench_function_over_inputs(

fluent-bundle/benches/unescape.ftl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
face-with-tears-of-joy = 😂
2+
tetragram-for-centre = 𝌆
3+
4+
surrogates-in-text = \uD83D\uDE02
5+
surrogates-in-string = {"\uD83D\uDE02"}
6+
surrogates-in-adjacent-strings = {"\uD83D"}{"\uDE02"}
7+
8+
emoji-in-text = A face 😂 with tears of joy.
9+
emoji-in-string = {"A face 😂 with tears of joy."}

fluent-bundle/src/resolve.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use super::bundle::FluentBundle;
1515
use super::entry::GetEntry;
1616
use super::types::FluentValue;
1717
use fluent_syntax::ast;
18+
use fluent_syntax::unicode::unescape_unicode;
1819

1920
#[derive(Debug, PartialEq)]
2021
pub enum ResolverError {
@@ -176,8 +177,11 @@ impl<'source> ResolveValue for ast::InlineExpression<'source> {
176177
fn to_value(&self, env: &Env) -> Result<FluentValue, ResolverError> {
177178
match self {
178179
ast::InlineExpression::StringLiteral { raw } => {
179-
// XXX: We need to decode the raw into unicode here.
180-
Ok(FluentValue::from(*raw))
180+
if let Some(unescaped_value) = unescape_unicode(raw) {
181+
Ok(FluentValue::from(unescaped_value))
182+
} else {
183+
Ok(FluentValue::from(*raw))
184+
}
181185
}
182186
ast::InlineExpression::NumberLiteral { value } => {
183187
Ok(FluentValue::as_number(*value).unwrap())

fluent-syntax/benches/parser.rs

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use std::io;
77
use std::io::Read;
88

99
use fluent_syntax::parser::parse;
10+
use fluent_syntax::unicode::unescape_unicode;
1011

1112
fn read_file(path: &str) -> Result<String, io::Error> {
1213
let mut f = File::open(path)?;
@@ -38,5 +39,32 @@ fn parser_bench(c: &mut Criterion) {
3839
);
3940
}
4041

41-
criterion_group!(benches, parser_bench);
42+
fn unicode_unescape_bench(c: &mut Criterion) {
43+
let strings = &[
44+
"foo",
45+
"This is an example value",
46+
"Hello \\u00e3\\u00e9 World",
47+
"\\u004c\\u006f\\u0072\\u0065\\u006d \\u0069\\u0070\\u0073\\u0075\\u006d \\u0064\\u006f\\u006c\\u006f\\u0072 \\u0073\\u0069\\u0074 \\u0061\\u006d\\u0065\\u0074",
48+
"Let me introduce \\\"The\\\" Fluent",
49+
"And here's an example of \\\\ a character to be escaped",
50+
"But this message is completely unescape free",
51+
"And so is this one",
52+
"Maybe this one is as well completely escape free",
53+
"Welcome to Mozilla Firefox",
54+
"\\u0054\\u0068\\u0065\\u0073\\u0065 \\u0073\\u0065\\u0074\\u0074\\u0069\\u006e\\u0067\\u0073 \\u0061\\u0072\\u0065 \\u0074\\u0061\\u0069\\u006c\\u006f\\u0072\\u0065\\u0064 \\u0074\\u006f \\u0079\\u006f\\u0075\\u0072 \\u0063\\u006f\\u006d\\u0070\\u0075\\u0074\\u0065\\u0072\\u2019\\u0073 \\u0068\\u0061\\u0072\\u0064\\u0077\\u0061\\u0072\\u0065 \\u0061\\u006e\\u0064 \\u006f\\u0070\\u0065\\u0072\\u0061\\u0074\\u0069\\u006e\\u0067 \\u0073\\u0079\\u0073\\u0074\\u0065\\u006d\\u002e",
55+
"These settings are tailored to your computer’s hardware and operating system",
56+
"Use recommended performance settings",
57+
"\\u0041\\u0064\\u0064\\u0069\\u0074\\u0069\\u006f\\u006e\\u0061\\u006c \\u0063\\u006f\\u006e\\u0074\\u0065\\u006e\\u0074 \\u0070\\u0072\\u006f\\u0063\\u0065\\u0073\\u0073\\u0065\\u0073 \\u0063\\u0061\\u006e \\u0069\\u006d\\u0070\\u0072\\u006f\\u0076\\u0065 \\u0070\\u0065\\u0072\\u0066\\u006f\\u0072\\u006d\\u0061\\u006e\\u0063\\u0065 \\u0077\\u0068\\u0065\\u006e \\u0075\\u0073\\u0069\\u006e\\u0067 \\u006d\\u0075\\u006c\\u0074\\u0069\\u0070\\u006c\\u0065 \\u0074\\u0061\\u0062\\u0073\\u002c \\u0062\\u0075\\u0074 \\u0077\\u0069\\u006c\\u006c \\u0061\\u006c\\u0073\\u006f \\u0075\\u0073\\u0065 \\u006d\\u006f\\u0072\\u0065 \\u006d\\u0065\\u006d\\u006f\\u0072\\u0079\\u002e",
58+
"Additional content processes can improve performance when using multiple tabs, but will also use more memory.",
59+
];
60+
c.bench_function("unicode", move |b| {
61+
b.iter(|| {
62+
for s in strings {
63+
unescape_unicode(s);
64+
}
65+
})
66+
});
67+
}
68+
69+
criterion_group!(benches, parser_bench, unicode_unescape_bench);
4270
criterion_main!(benches);

fluent-syntax/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pub mod ast;
22
pub mod parser;
3+
pub mod unicode;

fluent-syntax/src/unicode.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
use std::char;
2+
3+
/// Converts a string of hexadecimal digits into the `char` with that code point.
///
/// Returns U+FFFD REPLACEMENT CHARACTER when `s` cannot be parsed as a
/// base-16 `u32`, or when the parsed value is not a Unicode scalar value
/// (for example a lone surrogate such as `D83D`).
fn encode_unicode(s: &str) -> char {
    u32::from_str_radix(s, 16)
        .ok()
        .and_then(char::from_u32)
        .unwrap_or(char::REPLACEMENT_CHARACTER)
}

/// Advances `chars` by up to `len` characters, checking that each one is an
/// ASCII hexadecimal digit.
///
/// Returns `true` only if exactly `len` hex digits were consumed. On failure
/// the iterator is left just past the first offending character (or at the
/// end of the input if it ran out).
fn check_sequence(chars: &mut std::str::CharIndices<'_>, len: usize) -> bool {
    for _ in 0..len {
        match chars.next() {
            Some((_, c)) if c.is_ascii_hexdigit() => continue,
            _ => return false,
        }
    }
    true
}

/// Resolves the backslash escapes of a raw string literal.
///
/// Recognised escapes:
///
/// * `\\` becomes `\`
/// * `\"` becomes `"`
/// * `\uHHHH` (4 hex digits) and `\UHHHHHH` (6 hex digits) become the
///   character with that code point, or U+FFFD if the digits are missing,
///   malformed, or do not name a Unicode scalar value.
///
/// Any other escape, including a lone trailing backslash, is replaced with
/// U+FFFD instead of panicking.
///
/// Returns `None` when `string` contains no backslash, so callers can keep
/// using the borrowed input without allocating. Otherwise returns the
/// unescaped copy.
pub fn unescape_unicode(string: &str) -> Option<String> {
    // Allocated lazily, on the first backslash encountered.
    let mut result: Option<String> = None;

    // Walk characters rather than bytes: pushing individual UTF-8 bytes as
    // `char`s would corrupt any non-ASCII text that follows an escape.
    let mut chars = string.char_indices();

    while let Some((i, c)) = chars.next() {
        if c != '\\' {
            if let Some(ref mut unescaped) = result {
                unescaped.push(c);
            }
            continue;
        }

        // First escape: seed the output with everything before the backslash.
        let unescaped = result.get_or_insert_with(|| String::from(&string[..i]));

        let new_char = match chars.next() {
            Some((_, '\\')) => '\\',
            Some((_, '"')) => '"',
            Some((_, marker @ 'u')) | Some((_, marker @ 'U')) => {
                // The backslash and the marker are one byte each, so the
                // digits start two bytes after the backslash.
                let start = i + 2;
                let len = if marker == 'u' { 4 } else { 6 };
                if check_sequence(&mut chars, len) {
                    // All `len` characters are ASCII hex digits, so this byte
                    // range is exactly the digits and lies on char boundaries.
                    encode_unicode(&string[start..start + len])
                } else {
                    char::REPLACEMENT_CHARACTER
                }
            }
            // Unknown escape or end of input right after the backslash.
            _ => char::REPLACEMENT_CHARACTER,
        };
        unescaped.push(new_char);
    }

    result
}

fluent-syntax/tests/ast/helper.rs

Lines changed: 0 additions & 67 deletions
This file was deleted.

fluent-syntax/tests/ast/mod.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
mod helper;
2-
31
use fluent_syntax::ast;
2+
use fluent_syntax::unicode::unescape_unicode;
43
use serde::ser::SerializeMap;
54
use serde::ser::SerializeSeq;
65
use serde::{Serialize, Serializer};
@@ -360,7 +359,11 @@ where
360359
let mut map = serializer.serialize_map(Some(3))?;
361360
map.serialize_entry("type", "StringLiteral")?;
362361
map.serialize_entry("raw", raw)?;
363-
map.serialize_entry("value", &helper::unescape_unicode(&raw))?;
362+
if let Some(unescaped_value) = unescape_unicode(&raw) {
363+
map.serialize_entry("value", &unescaped_value)?;
364+
} else {
365+
map.serialize_entry("value", &raw)?;
366+
}
364367
map.end()
365368
}
366369

fluent-syntax/tests/unicode.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
use fluent_syntax::unicode::unescape_unicode;
2+
3+
/// Exercises `unescape_unicode` on an escape-free input (which must yield
/// `None`) and on a table of escaped inputs paired with their expected
/// unescaped form.
#[test]
fn unescape_unicode_test() {
    // Without any backslash there is nothing to rewrite.
    assert_eq!(unescape_unicode("foo"), None);

    // (raw input, expected unescaped output)
    let cases = [
        ("foo \\\\", "foo \\"),
        ("foo \\\"", "foo \""),
        ("foo \\\\ faa", "foo \\ faa"),
        ("foo \\\\ faa \\\\ fii", "foo \\ faa \\ fii"),
        ("foo \\\\\\\" faa \\\"\\\\ fii", "foo \\\" faa \"\\ fii"),
        ("\\u0041\\u004F", "AO"),
        // Too few hex digits: the escape collapses to the replacement char.
        ("\\uA", "�"),
    ];

    for &(raw, expected) in cases.iter() {
        assert_eq!(unescape_unicode(raw), Some(expected.to_owned()));
    }
}

0 commit comments

Comments
 (0)