Add unicode escaping in resolver

Zibi Braniecki · Zibi Braniecki · commit 19c810764720 · 2019-01-26T17:33:49.000-08:00
diff --git a/fluent-bundle/benches/resolver.rs b/fluent-bundle/benches/resolver.rs
@@ -39,7 +39,7 @@ fn get_ids(res: &FluentResource) -> Vec<String> {
 }
 
 fn resolver_bench(c: &mut Criterion) {
-    let tests = &["simple", "menubar"];
+    let tests = &["simple", "menubar", "unescape"];
     let ftl_strings = get_strings(tests);
 
     c.bench_function_over_inputs(
diff --git a/fluent-bundle/benches/unescape.ftl b/fluent-bundle/benches/unescape.ftl
@@ -0,0 +1,9 @@
+face-with-tears-of-joy = 😂
+tetragram-for-centre = 𝌆
+
+surrogates-in-text = \uD83D\uDE02
+surrogates-in-string = {"\uD83D\uDE02"}
+surrogates-in-adjacent-strings = {"\uD83D"}{"\uDE02"}
+
+emoji-in-text = A face 😂 with tears of joy.
+emoji-in-string = {"A face 😂 with tears of joy."}
diff --git a/fluent-bundle/src/resolve.rs b/fluent-bundle/src/resolve.rs
@@ -15,6 +15,7 @@ use super::bundle::FluentBundle;
 use super::entry::GetEntry;
 use super::types::FluentValue;
 use fluent_syntax::ast;
+use fluent_syntax::unicode::unescape_unicode;
 
 #[derive(Debug, PartialEq)]
 pub enum ResolverError {
@@ -176,8 +177,11 @@ impl<'source> ResolveValue for ast::InlineExpression<'source> {
     fn to_value(&self, env: &Env) -> Result<FluentValue, ResolverError> {
         match self {
             ast::InlineExpression::StringLiteral { raw } => {
-                // XXX: We need to decode the raw into unicode here.
-                Ok(FluentValue::from(*raw))
+                if let Some(unescaped_value) = unescape_unicode(raw) {
+                    Ok(FluentValue::from(unescaped_value))
+                } else {
+                    Ok(FluentValue::from(*raw))
+                }
             }
             ast::InlineExpression::NumberLiteral { value } => {
                 Ok(FluentValue::as_number(*value).unwrap())
diff --git a/fluent-syntax/benches/parser.rs b/fluent-syntax/benches/parser.rs
@@ -7,6 +7,7 @@ use std::io;
 use std::io::Read;
 
 use fluent_syntax::parser::parse;
+use fluent_syntax::unicode::unescape_unicode;
 
 fn read_file(path: &str) -> Result<String, io::Error> {
     let mut f = File::open(path)?;
@@ -38,5 +39,32 @@ fn parser_bench(c: &mut Criterion) {
     );
 }
 
-criterion_group!(benches, parser_bench);
+/// Benchmarks `unescape_unicode` over a fixed set of sample strings.
+///
+/// The samples mix strings without any escape, strings made of `\uHHHH`
+/// runs, and strings using the `\"` and `\\` escapes, so both the
+/// "nothing to do" path and the decoding path are exercised. The whole set
+/// is processed once per measured iteration under the benchmark id
+/// `"unicode"`.
+fn unicode_unescape_bench(c: &mut Criterion) {
+    let strings = &[
+        "foo",
+        "This is an example value",
+        "Hello \\u00e3\\u00e9 World",
+        "\\u004c\\u006f\\u0072\\u0065\\u006d \\u0069\\u0070\\u0073\\u0075\\u006d \\u0064\\u006f\\u006c\\u006f\\u0072 \\u0073\\u0069\\u0074 \\u0061\\u006d\\u0065\\u0074",
+        "Let me introduce \\\"The\\\" Fluent",
+        "And here's an example of \\\\ a character to be escaped",
+        "But this message is completely unescape free",
+        "And so is this one",
+        "Maybe this one is as well completely escape free",
+        "Welcome to Mozilla Firefox",
+        "\\u0054\\u0068\\u0065\\u0073\\u0065 \\u0073\\u0065\\u0074\\u0074\\u0069\\u006e\\u0067\\u0073 \\u0061\\u0072\\u0065 \\u0074\\u0061\\u0069\\u006c\\u006f\\u0072\\u0065\\u0064 \\u0074\\u006f \\u0079\\u006f\\u0075\\u0072 \\u0063\\u006f\\u006d\\u0070\\u0075\\u0074\\u0065\\u0072\\u2019\\u0073 \\u0068\\u0061\\u0072\\u0064\\u0077\\u0061\\u0072\\u0065 \\u0061\\u006e\\u0064 \\u006f\\u0070\\u0065\\u0072\\u0061\\u0074\\u0069\\u006e\\u0067 \\u0073\\u0079\\u0073\\u0074\\u0065\\u006d\\u002e",
+        "These settings are tailored to your computer’s hardware and operating system",
+        "Use recommended performance settings",
+        "\\u0041\\u0064\\u0064\\u0069\\u0074\\u0069\\u006f\\u006e\\u0061\\u006c \\u0063\\u006f\\u006e\\u0074\\u0065\\u006e\\u0074 \\u0070\\u0072\\u006f\\u0063\\u0065\\u0073\\u0073\\u0065\\u0073 \\u0063\\u0061\\u006e \\u0069\\u006d\\u0070\\u0072\\u006f\\u0076\\u0065 \\u0070\\u0065\\u0072\\u0066\\u006f\\u0072\\u006d\\u0061\\u006e\\u0063\\u0065 \\u0077\\u0068\\u0065\\u006e \\u0075\\u0073\\u0069\\u006e\\u0067 \\u006d\\u0075\\u006c\\u0074\\u0069\\u0070\\u006c\\u0065 \\u0074\\u0061\\u0062\\u0073\\u002c \\u0062\\u0075\\u0074 \\u0077\\u0069\\u006c\\u006c \\u0061\\u006c\\u0073\\u006f \\u0075\\u0073\\u0065 \\u006d\\u006f\\u0072\\u0065 \\u006d\\u0065\\u006d\\u006f\\u0072\\u0079\\u002e",
+        "Additional content processes can improve performance when using multiple tabs, but will also use more memory.",
+    ];
+    c.bench_function("unicode", move |b| {
+        b.iter(|| {
+            for s in strings {
+                // The closure itself returns `()`, so without `black_box` the
+                // result of every call is dead and the optimizer is free to
+                // drop the work being measured.
+                criterion::black_box(unescape_unicode(criterion::black_box(*s)));
+            }
+        })
+    });
+}
+
+criterion_group!(benches, parser_bench, unicode_unescape_bench);
 criterion_main!(benches);
diff --git a/fluent-syntax/src/lib.rs b/fluent-syntax/src/lib.rs
@@ -1,2 +1,3 @@
 pub mod ast;
 pub mod parser;
+pub mod unicode;
diff --git a/fluent-syntax/src/unicode.rs b/fluent-syntax/src/unicode.rs
@@ -0,0 +1,53 @@
+use std::char;
+
+/// Character substituted for any escape sequence that cannot be decoded.
+const UNKNOWN_CHAR: char = '\u{FFFD}';
+
+/// Converts a string of hexadecimal digits into the `char` with that code point.
+///
+/// Returns `UNKNOWN_CHAR` when `s` does not parse as a base-16 `u32` or when
+/// the number is not a Unicode scalar value (for example a lone surrogate
+/// such as `D83D`, or anything above `10FFFF`).
+fn encode_unicode(s: &str) -> char {
+    u32::from_str_radix(s, 16)
+        .ok()
+        .and_then(char::from_u32)
+        .unwrap_or(UNKNOWN_CHAR)
+}
+
+/// Decodes the digits of a `\u` / `\U` escape that should start at byte
+/// offset `*pos` of `string`.
+///
+/// At most `digits` ASCII hex digits are consumed and `*pos` is advanced past
+/// exactly those. The first byte that is not a hex digit is left in place, so
+/// it is kept as ordinary text and a multi-byte character is never split.
+/// Returns the decoded character, or `UNKNOWN_CHAR` when fewer than `digits`
+/// hex digits were present.
+fn decode_hex_escape(string: &str, pos: &mut usize, digits: usize) -> char {
+    let start = *pos;
+    let found = string.as_bytes()[start..]
+        .iter()
+        .take(digits)
+        .take_while(|b| b.is_ascii_hexdigit())
+        .count();
+    *pos = start + found;
+
+    if found == digits {
+        // Only ASCII bytes were consumed, so both ends are char boundaries.
+        encode_unicode(&string[start..*pos])
+    } else {
+        UNKNOWN_CHAR
+    }
+}
+
+/// Resolves the escape sequences of a raw string literal.
+///
+/// Recognised escapes are `\\`, `\"`, `\uHHHH` (four hex digits) and
+/// `\UHHHHHH` (six hex digits).
+///
+/// Returns `None` when `string` contains no backslash, which lets callers keep
+/// using the borrowed input without allocating. Otherwise returns
+/// `Some(unescaped)`.
+///
+/// Malformed input never panics; it is replaced by U+FFFD:
+///
+/// * a `\u` / `\U` followed by too few hex digits yields one U+FFFD for the
+///   backslash, the marker and the digits that were present;
+/// * a code point that is not a Unicode scalar value (e.g. a lone surrogate)
+///   yields U+FFFD;
+/// * a backslash followed by any other character, or a trailing backslash,
+///   yields U+FFFD for the backslash alone and keeps what follows as text.
+///
+/// # Examples
+///
+/// ```
+/// use fluent_syntax::unicode::unescape_unicode;
+///
+/// assert_eq!(unescape_unicode("foo"), None);
+/// assert_eq!(unescape_unicode("\\u0041 😂"), Some("A 😂".to_owned()));
+/// assert_eq!(unescape_unicode("\\uA"), Some("\u{FFFD}".to_owned()));
+/// ```
+pub fn unescape_unicode(string: &str) -> Option<String> {
+    let bytes = string.as_bytes();
+    let mut result: Option<String> = None;
+    // Byte offset of the first literal byte not yet copied into `result`.
+    let mut literal_start = 0;
+    let mut pos = 0;
+
+    while let Some(byte) = bytes.get(pos) {
+        if byte != &b'\\' {
+            pos += 1;
+            continue;
+        }
+
+        // Allocate lazily: strings without escapes never reach this point.
+        let unescaped = result.get_or_insert_with(|| String::with_capacity(string.len()));
+        // Copy the pending literal run as a `str` slice rather than byte by
+        // byte, so multi-byte UTF-8 sequences stay intact. A backslash is
+        // ASCII, hence `pos` is always a char boundary here.
+        unescaped.push_str(&string[literal_start..pos]);
+        pos += 1;
+
+        let decoded = match bytes.get(pos) {
+            Some(b'\\') => {
+                pos += 1;
+                '\\'
+            }
+            Some(b'"') => {
+                pos += 1;
+                '"'
+            }
+            Some(b'u') => {
+                pos += 1;
+                decode_hex_escape(string, &mut pos, 4)
+            }
+            Some(b'U') => {
+                pos += 1;
+                decode_hex_escape(string, &mut pos, 6)
+            }
+            // Unknown escape or trailing backslash: replace the backslash and
+            // leave the following character (if any) to be copied as text.
+            _ => UNKNOWN_CHAR,
+        };
+        unescaped.push(decoded);
+        literal_start = pos;
+    }
+
+    if let Some(unescaped) = result.as_mut() {
+        unescaped.push_str(&string[literal_start..]);
+    }
+    result
+}
diff --git a/fluent-syntax/tests/ast/helper.rs b/fluent-syntax/tests/ast/helper.rs
diff --git a/fluent-syntax/tests/ast/mod.rs b/fluent-syntax/tests/ast/mod.rs
@@ -1,6 +1,5 @@
-mod helper;
-
 use fluent_syntax::ast;
+use fluent_syntax::unicode::unescape_unicode;
 use serde::ser::SerializeMap;
 use serde::ser::SerializeSeq;
 use serde::{Serialize, Serializer};
@@ -360,7 +359,11 @@ where
     let mut map = serializer.serialize_map(Some(3))?;
     map.serialize_entry("type", "StringLiteral")?;
     map.serialize_entry("raw", raw)?;
-    map.serialize_entry("value", &helper::unescape_unicode(&raw))?;
+    if let Some(unescaped_value) = unescape_unicode(&raw) {
+        map.serialize_entry("value", &unescaped_value)?;
+    } else {
+        map.serialize_entry("value", &raw)?;
+    }
     map.end()
 }
 
diff --git a/fluent-syntax/tests/unicode.rs b/fluent-syntax/tests/unicode.rs
@@ -0,0 +1,22 @@
+use fluent_syntax::unicode::unescape_unicode;
+
+/// Checks `unescape_unicode` against a table of raw inputs and the value each
+/// one must produce (`None` meaning "no escape found, nothing allocated").
+#[test]
+fn unescape_unicode_test() {
+    let cases: &[(&str, Option<&str>)] = &[
+        // No backslash at all.
+        ("foo", None),
+        // Escaped backslash and escaped quote, alone and combined.
+        ("foo \\\\", Some("foo \\")),
+        ("foo \\\"", Some("foo \"")),
+        ("foo \\\\ faa", Some("foo \\ faa")),
+        ("foo \\\\ faa \\\\ fii", Some("foo \\ faa \\ fii")),
+        ("foo \\\\\\\" faa \\\"\\\\ fii", Some("foo \\\" faa \"\\ fii")),
+        // Four-digit unicode escapes.
+        ("\\u0041\\u004F", Some("AO")),
+        // Too few hex digits collapse into the replacement character.
+        ("\\uA", Some("�")),
+    ];
+
+    for (raw, expected) in cases {
+        assert_eq!(unescape_unicode(raw), expected.map(str::to_owned));
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,7 @@ fn get_ids(res: &FluentResource) -> Vec<String> {`
`39`	`39`	`}`
`40`	`40`
`41`	`41`	`fn resolver_bench(c: &mut Criterion) {`
`42`		`- let tests = &["simple", "menubar"];`
	`42`	`+ let tests = &["simple", "menubar", "unescape"];`
`43`	`43`	`let ftl_strings = get_strings(tests);`
`44`	`44`
`45`	`45`	`c.bench_function_over_inputs(`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`pub mod ast;`
`2`	`2`	`pub mod parser;`
	`3`	`+pub mod unicode;`