Skip to content

Commit dbdfa6e

Browse files
committed
Fix byte offsets after normalization
1 parent fda0359 commit dbdfa6e

File tree

6 files changed

+102
-20
lines changed

6 files changed

+102
-20
lines changed

src/librustc/ich/impls_syntax.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,7 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
424424
ref lines,
425425
ref multibyte_chars,
426426
ref non_narrow_chars,
427+
ref normalized_pos,
427428
} = *self;
428429

429430
(name_hash as u64).hash_stable(hcx, hasher);
@@ -452,6 +453,12 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
452453
for &char_pos in non_narrow_chars.iter() {
453454
stable_non_narrow_char(char_pos, start_pos).hash_stable(hcx, hasher);
454455
}
456+
457+
normalized_pos.len().hash_stable(hcx, hasher);
458+
for &char_pos in normalized_pos.iter() {
459+
stable_normalized_pos(char_pos, start_pos).hash_stable(hcx, hasher);
460+
}
461+
455462
}
456463
}
457464

@@ -481,6 +488,18 @@ fn stable_non_narrow_char(swc: ::syntax_pos::NonNarrowChar,
481488
(pos.0 - source_file_start.0, width as u32)
482489
}
483490

491+
fn stable_normalized_pos(np: ::syntax_pos::NormalizedPos,
492+
source_file_start: ::syntax_pos::BytePos)
493+
-> (u32, u32) {
494+
let ::syntax_pos::NormalizedPos {
495+
pos,
496+
diff
497+
} = np;
498+
499+
(pos.0 - source_file_start.0, diff as u32)
500+
}
501+
502+
484503
impl<'tcx> HashStable<StableHashingContext<'tcx>> for feature_gate::Features {
485504
fn hash_stable(&self, hcx: &mut StableHashingContext<'tcx>, hasher: &mut StableHasher) {
486505
// Unfortunately we cannot exhaustively list fields here, since the

src/librustc_metadata/decoder.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1330,6 +1330,7 @@ impl<'a, 'tcx> CrateMetadata {
13301330
mut lines,
13311331
mut multibyte_chars,
13321332
mut non_narrow_chars,
1333+
mut normalized_pos,
13331334
name_hash,
13341335
.. } = source_file_to_import;
13351336

@@ -1349,6 +1350,9 @@ impl<'a, 'tcx> CrateMetadata {
13491350
for swc in &mut non_narrow_chars {
13501351
*swc = *swc - start_pos;
13511352
}
1353+
for np in &mut normalized_pos {
1354+
np.pos = np.pos - start_pos;
1355+
}
13521356

13531357
let local_version = local_source_map.new_imported_source_file(name,
13541358
name_was_remapped,
@@ -1358,7 +1362,8 @@ impl<'a, 'tcx> CrateMetadata {
13581362
source_length,
13591363
lines,
13601364
multibyte_chars,
1361-
non_narrow_chars);
1365+
non_narrow_chars,
1366+
normalized_pos);
13621367
debug!("CrateMetaData::imported_source_files alloc \
13631368
source_file {:?} original (start_pos {:?} end_pos {:?}) \
13641369
translated (start_pos {:?} end_pos {:?})",

src/libsyntax/json.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -333,10 +333,24 @@ impl DiagnosticSpan {
333333
})
334334
});
335335

336+
let file_np = &start.file.normalized_pos;
337+
let np_start_diff = match file_np.binary_search_by(
338+
|np| np.pos.cmp(&span.lo())) {
339+
Ok(i) => file_np[i].diff,
340+
Err(i) if i == 0 => 0,
341+
Err(i) => file_np[i-1].diff,
342+
};
343+
let np_end_diff = match file_np.binary_search_by(
344+
|np| np.pos.cmp(&span.hi())) {
345+
Ok(i) => file_np[i].diff,
346+
Err(i) if i == 0 => 0,
347+
Err(i) => file_np[i-1].diff,
348+
};
349+
336350
DiagnosticSpan {
337351
file_name: start.file.name.to_string(),
338-
byte_start: span.lo().0 - start.file.start_pos.0,
339-
byte_end: span.hi().0 - start.file.start_pos.0,
352+
byte_start: span.lo().0 - start.file.start_pos.0 + np_start_diff,
353+
byte_end: span.hi().0 - start.file.start_pos.0 + np_end_diff,
340354
line_start: start.line,
341355
line_end: end.line,
342356
column_start: start.col.0 + 1,

src/libsyntax/source_map.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ impl SourceMap {
283283
mut file_local_lines: Vec<BytePos>,
284284
mut file_local_multibyte_chars: Vec<MultiByteChar>,
285285
mut file_local_non_narrow_chars: Vec<NonNarrowChar>,
286+
mut file_local_normalized_pos: Vec<NormalizedPos>,
286287
) -> Lrc<SourceFile> {
287288
let start_pos = self.next_start_pos();
288289

@@ -301,6 +302,10 @@ impl SourceMap {
301302
*swc = *swc + start_pos;
302303
}
303304

305+
for nc in &mut file_local_normalized_pos {
306+
nc.pos = nc.pos + start_pos;
307+
}
308+
304309
let source_file = Lrc::new(SourceFile {
305310
name: filename,
306311
name_was_remapped,
@@ -314,6 +319,7 @@ impl SourceMap {
314319
lines: file_local_lines,
315320
multibyte_chars: file_local_multibyte_chars,
316321
non_narrow_chars: file_local_non_narrow_chars,
322+
normalized_pos: file_local_normalized_pos,
317323
name_hash,
318324
});
319325

src/libsyntax_pos/lib.rs

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,15 @@ impl Sub<BytePos> for NonNarrowChar {
849849
}
850850
}
851851

852+
/// Identifies the offset of a character that was removed from a `SourceFile` during normalization.
853+
#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq, Debug)]
854+
pub struct NormalizedPos {
855+
/// The absolute offset of the character in the `SourceMap`.
856+
pub pos: BytePos,
857+
/// The cumulative number of bytes removed from the original string before this position.
858+
pub diff: u32,
859+
}
860+
852861
/// The state of the lazy external source loading mechanism of a `SourceFile`.
853862
#[derive(PartialEq, Eq, Clone)]
854863
pub enum ExternalSource {
@@ -912,6 +921,8 @@ pub struct SourceFile {
912921
pub multibyte_chars: Vec<MultiByteChar>,
913922
/// Width of characters that are not narrow in the source code.
914923
pub non_narrow_chars: Vec<NonNarrowChar>,
924+
/// Locations of characters removed during normalization.
925+
pub normalized_pos: Vec<NormalizedPos>,
915926
/// A hash of the filename, used for speeding up hashing in incremental compilation.
916927
pub name_hash: u128,
917928
}
@@ -978,6 +989,9 @@ impl Encodable for SourceFile {
978989
})?;
979990
s.emit_struct_field("name_hash", 8, |s| {
980991
self.name_hash.encode(s)
992+
})?;
993+
s.emit_struct_field("normalized_pos", 9, |s| {
994+
self.normalized_pos.encode(s)
981995
})
982996
})
983997
}
@@ -1028,6 +1042,8 @@ impl Decodable for SourceFile {
10281042
d.read_struct_field("non_narrow_chars", 7, |d| Decodable::decode(d))?;
10291043
let name_hash: u128 =
10301044
d.read_struct_field("name_hash", 8, |d| Decodable::decode(d))?;
1045+
let normalized_pos: Vec<NormalizedPos> =
1046+
d.read_struct_field("normalized_pos", 9, |d| Decodable::decode(d))?;
10311047
Ok(SourceFile {
10321048
name,
10331049
name_was_remapped,
@@ -1044,6 +1060,7 @@ impl Decodable for SourceFile {
10441060
lines,
10451061
multibyte_chars,
10461062
non_narrow_chars,
1063+
normalized_pos,
10471064
name_hash,
10481065
})
10491066
})
@@ -1062,8 +1079,7 @@ impl SourceFile {
10621079
unmapped_path: FileName,
10631080
mut src: String,
10641081
start_pos: BytePos) -> Result<SourceFile, OffsetOverflowError> {
1065-
remove_bom(&mut src);
1066-
normalize_newlines(&mut src);
1082+
let normalized_pos = normalize_src(&mut src);
10671083

10681084
let src_hash = {
10691085
let mut hasher: StableHasher = StableHasher::new();
@@ -1096,6 +1112,7 @@ impl SourceFile {
10961112
lines,
10971113
multibyte_chars,
10981114
non_narrow_chars,
1115+
normalized_pos,
10991116
name_hash,
11001117
})
11011118
}
@@ -1224,18 +1241,27 @@ impl SourceFile {
12241241
}
12251242
}
12261243

1244+
/// Normalizes the source code and records the normalizations.
1245+
fn normalize_src(src: &mut String) -> Vec<NormalizedPos> {
1246+
let mut normalized_pos = vec![];
1247+
remove_bom(src, &mut normalized_pos);
1248+
normalize_newlines(src, &mut normalized_pos);
1249+
normalized_pos
1250+
}
1251+
12271252
/// Removes UTF-8 BOM, if any.
1228-
fn remove_bom(src: &mut String) {
1253+
fn remove_bom(src: &mut String, normalized_pos: &mut Vec<NormalizedPos>) {
12291254
if src.starts_with("\u{feff}") {
12301255
src.drain(..3);
1256+
normalized_pos.push(NormalizedPos { pos: BytePos(0), diff: 3 });
12311257
}
12321258
}
12331259

12341260

12351261
/// Replaces `\r\n` with `\n` in-place in `src`.
12361262
///
12371263
/// Returns error if there's a lone `\r` in the string
1238-
fn normalize_newlines(src: &mut String) {
1264+
fn normalize_newlines(src: &mut String, normalized_pos: &mut Vec<NormalizedPos>) {
12391265
if !src.as_bytes().contains(&b'\r') {
12401266
return;
12411267
}
@@ -1248,6 +1274,8 @@ fn normalize_newlines(src: &mut String) {
12481274
let mut buf = std::mem::replace(src, String::new()).into_bytes();
12491275
let mut gap_len = 0;
12501276
let mut tail = buf.as_mut_slice();
1277+
let mut cursor = 0;
1278+
let original_gap = normalized_pos.last().map_or(0, |l| l.diff);
12511279
loop {
12521280
let idx = match find_crlf(&tail[gap_len..]) {
12531281
None => tail.len(),
@@ -1258,7 +1286,12 @@ fn normalize_newlines(src: &mut String) {
12581286
if tail.len() == gap_len {
12591287
break;
12601288
}
1289+
cursor += idx - gap_len;
12611290
gap_len += 1;
1291+
normalized_pos.push(NormalizedPos {
1292+
pos: BytePos::from_usize(cursor + 1),
1293+
diff: original_gap + gap_len as u32,
1294+
});
12621295
}
12631296

12641297
// Account for removed `\r`.

src/libsyntax_pos/tests.rs

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,25 @@ fn test_lookup_line() {
1919

2020
#[test]
2121
fn test_normalize_newlines() {
22-
fn check(before: &str, after: &str) {
22+
fn check(before: &str, after: &str, expected_positions: &[u32]) {
2323
let mut actual = before.to_string();
24-
normalize_newlines(&mut actual);
24+
let mut actual_positions = vec![];
25+
normalize_newlines(&mut actual, &mut actual_positions);
26+
let actual_positions : Vec<_> = actual_positions
27+
.into_iter()
28+
.map(|nc| nc.pos.0).collect();
2529
assert_eq!(actual.as_str(), after);
30+
assert_eq!(actual_positions, expected_positions);
2631
}
27-
check("", "");
28-
check("\n", "\n");
29-
check("\r", "\r");
30-
check("\r\r", "\r\r");
31-
check("\r\n", "\n");
32-
check("hello world", "hello world");
33-
check("hello\nworld", "hello\nworld");
34-
check("hello\r\nworld", "hello\nworld");
35-
check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n");
36-
check("\r\r\n", "\r\n");
37-
check("hello\rworld", "hello\rworld");
32+
check("", "", &[]);
33+
check("\n", "\n", &[]);
34+
check("\r", "\r", &[]);
35+
check("\r\r", "\r\r", &[]);
36+
check("\r\n", "\n", &[1]);
37+
check("hello world", "hello world", &[]);
38+
check("hello\nworld", "hello\nworld", &[]);
39+
check("hello\r\nworld", "hello\nworld", &[6]);
40+
check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n", &[1, 7, 13]);
41+
check("\r\r\n", "\r\n", &[2]);
42+
check("hello\rworld", "hello\rworld", &[]);
3843
}

0 commit comments

Comments
 (0)