Skip to content

Commit ff5b216

Browse files
authored
Add unicode escaping in resolver (#84)
* Add unicode escaping in resolver * Switch to Cow
1 parent 8067752 commit ff5b216

File tree

9 files changed

+126
-74
lines changed

9 files changed

+126
-74
lines changed

fluent-bundle/benches/resolver.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ fn get_ids(res: &FluentResource) -> Vec<String> {
3939
}
4040

4141
fn resolver_bench(c: &mut Criterion) {
42-
let tests = &["simple", "menubar"];
42+
let tests = &["simple", "menubar", "unescape"];
4343
let ftl_strings = get_strings(tests);
4444

4545
c.bench_function_over_inputs(

fluent-bundle/benches/unescape.ftl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
face-with-tears-of-joy = 😂
2+
tetragram-for-centre = 𝌆
3+
4+
surrogates-in-text = \uD83D\uDE02
5+
surrogates-in-string = {"\uD83D\uDE02"}
6+
surrogates-in-adjacent-strings = {"\uD83D"}{"\uDE02"}
7+
8+
emoji-in-text = A face 😂 with tears of joy.
9+
emoji-in-string = {"A face 😂 with tears of joy."}

fluent-bundle/src/resolve.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use super::bundle::FluentBundle;
1515
use super::entry::GetEntry;
1616
use super::types::FluentValue;
1717
use fluent_syntax::ast;
18+
use fluent_syntax::unicode::unescape_unicode;
1819

1920
#[derive(Debug, PartialEq)]
2021
pub enum ResolverError {
@@ -176,8 +177,7 @@ impl<'source> ResolveValue for ast::InlineExpression<'source> {
176177
fn to_value(&self, env: &Env) -> Result<FluentValue, ResolverError> {
177178
match self {
178179
ast::InlineExpression::StringLiteral { raw } => {
179-
// XXX: We need to decode the raw into unicode here.
180-
Ok(FluentValue::from(*raw))
180+
Ok(FluentValue::from(unescape_unicode(raw).into_owned()))
181181
}
182182
ast::InlineExpression::NumberLiteral { value } => {
183183
Ok(FluentValue::as_number(*value).unwrap())

fluent-syntax/benches/parser.rs

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use std::io;
77
use std::io::Read;
88

99
use fluent_syntax::parser::parse;
10+
use fluent_syntax::unicode::unescape_unicode;
1011

1112
fn read_file(path: &str) -> Result<String, io::Error> {
1213
let mut f = File::open(path)?;
@@ -38,5 +39,32 @@ fn parser_bench(c: &mut Criterion) {
3839
);
3940
}
4041

41-
criterion_group!(benches, parser_bench);
42+
fn unicode_unescape_bench(c: &mut Criterion) {
43+
let strings = &[
44+
"foo",
45+
"This is an example value",
46+
"Hello \\u00e3\\u00e9 World",
47+
"\\u004c\\u006f\\u0072\\u0065\\u006d \\u0069\\u0070\\u0073\\u0075\\u006d \\u0064\\u006f\\u006c\\u006f\\u0072 \\u0073\\u0069\\u0074 \\u0061\\u006d\\u0065\\u0074",
48+
"Let me introduce \\\"The\\\" Fluent",
49+
"And here's an example of \\\\ a character to be escaped",
50+
"But this message is completely unescape free",
51+
"And so is this one",
52+
"Maybe this one is as well completely escape free",
53+
"Welcome to Mozilla Firefox",
54+
"\\u0054\\u0068\\u0065\\u0073\\u0065 \\u0073\\u0065\\u0074\\u0074\\u0069\\u006e\\u0067\\u0073 \\u0061\\u0072\\u0065 \\u0074\\u0061\\u0069\\u006c\\u006f\\u0072\\u0065\\u0064 \\u0074\\u006f \\u0079\\u006f\\u0075\\u0072 \\u0063\\u006f\\u006d\\u0070\\u0075\\u0074\\u0065\\u0072\\u2019\\u0073 \\u0068\\u0061\\u0072\\u0064\\u0077\\u0061\\u0072\\u0065 \\u0061\\u006e\\u0064 \\u006f\\u0070\\u0065\\u0072\\u0061\\u0074\\u0069\\u006e\\u0067 \\u0073\\u0079\\u0073\\u0074\\u0065\\u006d\\u002e",
55+
"These settings are tailored to your computer’s hardware and operating system",
56+
"Use recommended performance settings",
57+
"\\u0041\\u0064\\u0064\\u0069\\u0074\\u0069\\u006f\\u006e\\u0061\\u006c \\u0063\\u006f\\u006e\\u0074\\u0065\\u006e\\u0074 \\u0070\\u0072\\u006f\\u0063\\u0065\\u0073\\u0073\\u0065\\u0073 \\u0063\\u0061\\u006e \\u0069\\u006d\\u0070\\u0072\\u006f\\u0076\\u0065 \\u0070\\u0065\\u0072\\u0066\\u006f\\u0072\\u006d\\u0061\\u006e\\u0063\\u0065 \\u0077\\u0068\\u0065\\u006e \\u0075\\u0073\\u0069\\u006e\\u0067 \\u006d\\u0075\\u006c\\u0074\\u0069\\u0070\\u006c\\u0065 \\u0074\\u0061\\u0062\\u0073\\u002c \\u0062\\u0075\\u0074 \\u0077\\u0069\\u006c\\u006c \\u0061\\u006c\\u0073\\u006f \\u0075\\u0073\\u0065 \\u006d\\u006f\\u0072\\u0065 \\u006d\\u0065\\u006d\\u006f\\u0072\\u0079\\u002e",
58+
"Additional content processes can improve performance when using multiple tabs, but will also use more memory.",
59+
];
60+
c.bench_function("unicode", move |b| {
61+
b.iter(|| {
62+
for s in strings {
63+
unescape_unicode(s);
64+
}
65+
})
66+
});
67+
}
68+
69+
criterion_group!(benches, parser_bench, unicode_unescape_bench);
4270
criterion_main!(benches);

fluent-syntax/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pub mod ast;
22
pub mod parser;
3+
pub mod unicode;

fluent-syntax/src/unicode.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
use std::borrow::Cow;
2+
use std::char;
3+
4+
/// Character substituted for every malformed or unrepresentable escape
/// sequence (U+FFFD REPLACEMENT CHARACTER).
const UNKNOWN_CHAR: char = '\u{FFFD}';

/// Converts a run of hexadecimal digits into the `char` with that code point.
///
/// Returns U+FFFD when `s` is empty, contains anything other than ASCII hex
/// digits, or names a value that is not a Unicode scalar value (for example a
/// lone surrogate such as `D83D`).
fn encode_unicode(s: &str) -> char {
    // `from_str_radix` tolerates a leading `+`, which is not a valid escape
    // digit, so the digits are validated explicitly first.
    if s.is_empty() || !s.bytes().all(|b| b.is_ascii_hexdigit()) {
        return UNKNOWN_CHAR;
    }
    u32::from_str_radix(s, 16)
        .ok()
        .and_then(char::from_u32)
        .unwrap_or(UNKNOWN_CHAR)
}

/// Resolves the backslash escapes in `input`.
///
/// Recognised escapes are `\\`, `\"`, `\uXXXX` (four hex digits) and
/// `\UXXXXXX` (six hex digits). Any other escape, a truncated or non-hex
/// digit run, or a code point that is not a Unicode scalar value is replaced
/// by U+FFFD.
///
/// Returns `Cow::Borrowed(input)` when `input` contains no backslash, so the
/// common escape-free case does not allocate; otherwise returns an owned
/// string with the escapes resolved. All text outside escapes is copied
/// verbatim, including multi-byte UTF-8 characters.
pub fn unescape_unicode<'u>(input: &'u str) -> Cow<'u, str> {
    // Fast path: nothing to unescape, hand the input back untouched.
    let first_escape = match input.find('\\') {
        Some(pos) => pos,
        None => return Cow::Borrowed(input),
    };

    let bytes = input.as_bytes();
    let mut result = String::with_capacity(input.len());
    result.push_str(&input[..first_escape]);

    // Invariant at the top of each iteration: `ptr` is on a char boundary.
    let mut ptr = first_escape;
    while ptr < bytes.len() {
        if bytes[ptr] != b'\\' {
            // Copy the whole literal run up to the next escape as a string
            // slice. Pushing individual bytes as `char`s would re-encode each
            // byte of a multi-byte character separately and corrupt it.
            let end = input[ptr..]
                .find('\\')
                .map_or(input.len(), |offset| ptr + offset);
            result.push_str(&input[ptr..end]);
            ptr = end;
            continue;
        }

        // Step over the backslash and look at the escape kind.
        ptr += 1;

        let new_char = match bytes.get(ptr) {
            Some(b'\\') => '\\',
            Some(b'"') => '"',
            Some(marker @ b'u') | Some(marker @ b'U') => {
                let len = if *marker == b'u' { 4 } else { 6 };
                let start = ptr + 1;
                // Leave `ptr` on the last digit; the shared `+= 1` below
                // then moves past the complete escape.
                ptr += len;
                // `get` yields `None` for a truncated run or one that ends
                // inside a multi-byte character.
                input
                    .get(start..start + len)
                    .map_or(UNKNOWN_CHAR, encode_unicode)
            }
            _ => UNKNOWN_CHAR,
        };
        result.push(new_char);
        ptr += 1;

        // A malformed escape may have consumed only part of a multi-byte
        // character; skip the remainder so later slicing stays valid.
        while ptr < input.len() && !input.is_char_boundary(ptr) {
            ptr += 1;
        }
    }

    Cow::Owned(result)
}

fluent-syntax/tests/ast/helper.rs

Lines changed: 0 additions & 67 deletions
This file was deleted.

fluent-syntax/tests/ast/mod.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
mod helper;
2-
31
use fluent_syntax::ast;
2+
use fluent_syntax::unicode::unescape_unicode;
43
use serde::ser::SerializeMap;
54
use serde::ser::SerializeSeq;
65
use serde::{Serialize, Serializer};
@@ -360,7 +359,7 @@ where
360359
let mut map = serializer.serialize_map(Some(3))?;
361360
map.serialize_entry("type", "StringLiteral")?;
362361
map.serialize_entry("raw", raw)?;
363-
map.serialize_entry("value", &helper::unescape_unicode(&raw))?;
362+
map.serialize_entry("value", &unescape_unicode(&raw))?;
364363
map.end()
365364
}
366365

fluent-syntax/tests/unicode.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
use fluent_syntax::unicode::unescape_unicode;
2+
use std::borrow::Cow;
3+
4+
/// Reports whether `input` is the `Cow::Borrowed` variant, i.e. whether the
/// producer returned the original slice without allocating.
fn is_cow_borrowed<'a>(input: Cow<'a, str>) -> bool {
    // Exhaustive match instead of `if let … { true } else { false }`.
    match input {
        Cow::Borrowed(_) => true,
        Cow::Owned(_) => false,
    }
}
11+
12+
/// Exercises `unescape_unicode`: the allocation-free path for escape-free
/// input, the `\\` and `\"` escapes, the `\uXXXX` / `\UXXXXXX` forms, and the
/// U+FFFD fallback for malformed or unknown escapes.
#[test]
fn unescape_unicode_test() {
    // Input without escapes must come back borrowed; input with an escape
    // has to be rewritten and therefore comes back owned.
    assert!(is_cow_borrowed(unescape_unicode("foo")));
    assert!(!is_cow_borrowed(unescape_unicode("foo \\\\")));

    assert_eq!(unescape_unicode("foo"), "foo");
    assert_eq!(unescape_unicode("foo \\\\"), "foo \\");
    assert_eq!(unescape_unicode("foo \\\""), "foo \"");
    assert_eq!(unescape_unicode("foo \\\\ faa"), "foo \\ faa");
    assert_eq!(
        unescape_unicode("foo \\\\ faa \\\\ fii"),
        "foo \\ faa \\ fii"
    );
    assert_eq!(
        unescape_unicode("foo \\\\\\\" faa \\\"\\\\ fii"),
        "foo \\\" faa \"\\ fii"
    );

    // Four-digit and six-digit code point escapes, in either hex case.
    assert_eq!(unescape_unicode("\\u0041\\u004F"), "AO");
    assert_eq!(unescape_unicode("\\u00e9"), "é");
    assert_eq!(unescape_unicode("\\U01F602"), "😂");

    // Truncated, non-hex and unknown escapes collapse to U+FFFD.
    assert_eq!(unescape_unicode("\\uA"), "�");
    assert_eq!(unescape_unicode("\\uA0Pl"), "�");
    assert_eq!(unescape_unicode("\\d Foo"), "� Foo");
}

0 commit comments

Comments
 (0)