feat: make frontend normalize line endings to LF (#3903)

kmill · web-flow · commit a7338c5ad887 · 2024-05-20T17:13:08.000Z
To eliminate parsing differences between Windows and other platforms, the frontend now normalizes all CRLF line endings to LF, like [in Rust](rust-lang/rust#62865). Effects: - This makes Lake hashes be faithful to what Lean sees (Lake already normalizes line endings before computing hashes). - Docstrings now have normalized line endings. In particular, this fixes `#guard_msgs` failing multiline tests for Windows users using CRLF. - Now strings don't have different lengths depending on the platform. Before this PR, the following theorem is true for LF and false for CRLF files. ```lean example : " ".length = 1 := rfl ``` Note: the normalization will take `\r\r\n` and turn it into `\r\n`. In the elaborator, we reject loose `\r`'s that appear in whitespace. Rust instead takes the approach of making the normalization routine fail. They do this so that there's no downstream confusion about any `\r\n` that appears. Implementation note: the LSP maintains its own copy of a source file that it updates when edit operations are applied. We are assuming that edit operations never split or join CRLFs. If this assumption is not correct, then the LSP copy of a source file can become slightly out of sync. If this is an issue, there is some discussion [here](#3903 (comment)).
diff --git a/src/Init/Data/String/Extra.lean b/src/Init/Data/String/Extra.lean
@@ -198,4 +198,35 @@ def removeLeadingSpaces (s : String) : String :=
   let n := findLeadingSpacesSize s
   if n == 0 then s else removeNumLeadingSpaces n s
 
+/--
+Replaces each `\r\n` with `\n` to normalize line endings,
+but does not validate that there are no isolated `\r` characters.
+It is an optimized version of `String.replace text "\r\n" "\n"`.
+-/
+def crlfToLf (text : String) : String :=
+  go "" 0 0
+where
+  go (acc : String) (accStop pos : String.Pos) : String :=
+    if h : text.atEnd pos then
+      -- note: if accStop = 0 then acc is empty
+      if accStop = 0 then text else acc ++ text.extract accStop pos
+    else
+      let c := text.get' pos h
+      let pos' := text.next' pos h
+      if h' : ¬ text.atEnd pos' ∧ c == '\r' ∧ text.get pos' == '\n' then
+        let acc := acc ++ text.extract accStop pos
+        go acc pos' (text.next' pos' h'.1)
+      else
+        go acc accStop pos'
+  termination_by text.utf8ByteSize - pos.byteIdx
+  decreasing_by
+    decreasing_with
+      show text.utf8ByteSize - (text.next' (text.next' pos _) _).byteIdx < text.utf8ByteSize - pos.byteIdx
+      have k := Nat.gt_of_not_le <| mt decide_eq_true h
+      exact Nat.sub_lt_sub_left k (Nat.lt_trans (String.lt_next text pos) (String.lt_next _ _))
+    decreasing_with
+      show text.utf8ByteSize - (text.next' pos _).byteIdx < text.utf8ByteSize - pos.byteIdx
+      have k := Nat.gt_of_not_le <| mt decide_eq_true h
+      exact Nat.sub_lt_sub_left k (String.lt_next _ _)
+
 end String
diff --git a/src/Lean/Data/Position.lean b/src/Lean/Data/Position.lean
@@ -106,7 +106,7 @@ def ofPosition (text : FileMap) (pos : Position) : String.Pos :=
 
 /--
 Returns the position of the start of (1-based) line `line`.
-This gives the stame result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
+This gives the same result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
 -/
 def lineStart (map : FileMap) (line : Nat) : String.Pos :=
   if h : line - 1 < map.positions.size then
diff --git a/src/Lean/Parser/Basic.lean b/src/Lean/Parser/Basic.lean
@@ -558,6 +558,8 @@ partial def whitespace : ParserFn := fun c s =>
     let curr := input.get' i h
     if curr == '\t' then
       s.mkUnexpectedError (pushMissing := false) "tabs are not allowed; please configure your editor to expand them"
+    else if curr == '\r' then
+      s.mkUnexpectedError (pushMissing := false) "isolated carriage returns are not allowed"
     else if curr.isWhitespace then whitespace c (s.next' input i h)
     else if curr == '-' then
       let i    := input.next' i h
@@ -621,9 +623,8 @@ def hexDigitFn : ParserFn := fun c s =>
 /--
 Parses the whitespace after the `\` when there is a string gap.
 Raises an error if the whitespace does not contain exactly one newline character.
-Processes `\r\n` as a newline.
 -/
-partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
+partial def stringGapFn (seenNewline : Bool) : ParserFn := fun c s =>
   let i := s.pos
   if h : c.input.atEnd i then s -- let strLitFnAux handle the EOI error if !seenNewline
   else
@@ -633,13 +634,9 @@ partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
         -- Having more than one newline in a string gap is visually confusing
         s.mkUnexpectedError "unexpected additional newline in string gap"
       else
-        stringGapFn true false c (s.next' c.input i h)
-    else if curr == '\r' then
-      stringGapFn seenNewline true c (s.next' c.input i h)
-    else if afterCR then
-      s.mkUnexpectedError "expecting newline after carriage return"
+        stringGapFn true c (s.next' c.input i h)
     else if curr.isWhitespace then
-      stringGapFn seenNewline false c (s.next' c.input i h)
+      stringGapFn seenNewline c (s.next' c.input i h)
     else if seenNewline then
       s
     else
@@ -663,8 +660,8 @@ def quotedCharCoreFn (isQuotable : Char → Bool) (inString : Bool) : ParserFn :
       andthenFn hexDigitFn hexDigitFn c (s.next' input i h)
     else if curr == 'u' then
       andthenFn hexDigitFn (andthenFn hexDigitFn (andthenFn hexDigitFn hexDigitFn)) c (s.next' input i h)
-    else if inString && (curr == '\n' || curr == '\r') then
-      stringGapFn false false c s
+    else if inString && curr == '\n' then
+      stringGapFn false c s
     else
       s.mkUnexpectedError "invalid escape sequence"
 
diff --git a/src/Lean/Parser/Extension.lean b/src/Lean/Parser/Extension.lean
@@ -437,11 +437,12 @@ def getSyntaxNodeKinds (env : Environment) : List SyntaxNodeKind :=
 def getTokenTable (env : Environment) : TokenTable :=
   (parserExtension.getState env).tokens
 
-def mkInputContext (input : String) (fileName : String) : InputContext := {
-  input    := input,
-  fileName := fileName,
-  fileMap  := input.toFileMap
-}
+-- Note: `crlfToLf` preserves logical line and column numbers for each character.
+def mkInputContext (input : String) (fileName : String) (normalizeLineEndings := true) : InputContext :=
+  let input' := if normalizeLineEndings then input.crlfToLf else input
+  { input    := input',
+    fileName := fileName,
+    fileMap  := input'.toFileMap }
 
 def mkParserState (input : String) : ParserState :=
   { cache := initCacheForInput input }
@@ -453,7 +454,7 @@ def runParserCategory (env : Environment) (catName : Name) (input : String) (fil
   let s := p.run ictx { env, options := {} } (getTokenTable env) (mkParserState input)
   if !s.allErrors.isEmpty  then
     Except.error (s.toErrorMsg ictx)
-  else if input.atEnd s.pos then
+  else if ictx.input.atEnd s.pos then
     Except.ok s.stxStack.back
   else
     Except.error ((s.mkError "end of input").toErrorMsg ictx)
diff --git a/src/Lean/Server/FileWorker.lean b/src/Lean/Server/FileWorker.lean
@@ -705,12 +705,9 @@ def initAndRunWorker (i o e : FS.Stream) (opts : Options) : IO UInt32 := do
   let initParams ← i.readLspRequestAs "initialize" InitializeParams
   let ⟨_, param⟩ ← i.readLspNotificationAs "textDocument/didOpen" LeanDidOpenTextDocumentParams
   let doc := param.textDocument
-  /- NOTE(WN): `toFileMap` marks line beginnings as immediately following
-    "\n", which should be enough to handle both LF and CRLF correctly.
-    This is because LSP always refers to characters by (line, column),
-    so if we get the line number correct it shouldn't matter that there
-    is a CR there. -/
-  let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.toFileMap, param.dependencyBuildMode?.getD .always⟩
+  /- Note (kmill): LSP always refers to characters by (line, column),
+     so converting CRLF to LF preserves line and column numbers. -/
+  let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, param.dependencyBuildMode?.getD .always⟩
   let e := e.withPrefix s!"[{param.textDocument.uri}] "
   let _ ← IO.setStderr e
   let (ctx, st) ← try
diff --git a/src/Lean/Server/Utils.lean b/src/Lean/Server/Utils.lean
@@ -75,7 +75,10 @@ structure DocumentMeta where
   uri                 : Lsp.DocumentUri
   /-- Version number of the document. Incremented whenever the document is edited. -/
   version             : Nat
-  /-- Current text of the document. -/
+  /--
+  Current text of the document.
+  It is maintained such that it is normalized using `String.crlfToLf`, which preserves logical line/column numbers.
+  (Note: we assume that edit operations never split or merge `\r\n` line endings.) -/
   text                : FileMap
   /--
   Controls when dependencies of the document are built on `textDocument/didOpen` notifications.
@@ -98,7 +101,11 @@ def replaceLspRange (text : FileMap) (r : Lsp.Range) (newText : String) : FileMa
   let «end» := text.lspPosToUtf8Pos r.«end»
   let pre := text.source.extract 0 start
   let post := text.source.extract «end» text.source.endPos
-  (pre ++ newText ++ post).toFileMap
+  -- `pre` and `post` already have normalized line endings, so only `newText` needs its endings normalized.
+  -- Note: this assumes that editing never separates a `\r\n`.
+  -- If `pre` ends with `\r` and `newText` begins with `\n`, the result is potentially inaccurate.
+  -- If this is ever a problem, we could store a second unnormalized FileMap, edit it, and normalize it here.
+  (pre ++ newText.crlfToLf ++ post).toFileMap
 
 open IO
 
@@ -125,7 +132,7 @@ def applyDocumentChange (oldText : FileMap) : (change : Lsp.TextDocumentContentC
   | TextDocumentContentChangeEvent.rangeChange (range : Range) (newText : String) =>
     replaceLspRange oldText range newText
   | TextDocumentContentChangeEvent.fullChange (newText : String) =>
-    newText.toFileMap
+    newText.crlfToLf.toFileMap
 
 /-- Returns the document contents with all changes applied. -/
 def foldDocumentChanges (changes : Array Lsp.TextDocumentContentChangeEvent) (oldText : FileMap) : FileMap :=
diff --git a/src/Lean/Server/Watchdog.lean b/src/Lean/Server/Watchdog.lean
@@ -728,12 +728,9 @@ end RequestHandling
 section NotificationHandling
   def handleDidOpen (p : LeanDidOpenTextDocumentParams) : ServerM Unit :=
     let doc := p.textDocument
-    /- NOTE(WN): `toFileMap` marks line beginnings as immediately following
-       "\n", which should be enough to handle both LF and CRLF correctly.
-       This is because LSP always refers to characters by (line, column),
-       so if we get the line number correct it shouldn't matter that there
-       is a CR there. -/
-    startFileWorker ⟨doc.uri, doc.version, doc.text.toFileMap, p.dependencyBuildMode?.getD .always⟩
+    /- Note (kmill): LSP always refers to characters by (line, column),
+      so converting CRLF to LF preserves line and column numbers. -/
+    startFileWorker ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, p.dependencyBuildMode?.getD .always⟩
 
   def handleDidChange (p : DidChangeTextDocumentParams) : ServerM Unit := do
     let doc := p.textDocument
diff --git a/src/lake/Lake/Build/Trace.lean b/src/lake/Lake/Build/Trace.lean
@@ -3,7 +3,6 @@ Copyright (c) 2021 Mac Malone. All rights reserved.
 Released under Apache 2.0 license as described in the file LICENSE.
 Authors: Mac Malone
 -/
-import Lake.Util.Newline
 
 open System
 namespace Lake
@@ -135,7 +134,7 @@ instance : ComputeHash FilePath IO := ⟨computeFileHash⟩
 
 def computeTextFileHash (file : FilePath) : IO Hash := do
   let text ← IO.FS.readFile file
-  let text := crlf2lf text
+  let text := text.crlfToLf
   return Hash.ofString text
 
 /--
diff --git a/src/lake/Lake/Util/Newline.lean b/src/lake/Lake/Util/Newline.lean
diff --git a/src/lake/tests/toml/Test.lean b/src/lake/tests/toml/Test.lean
@@ -5,7 +5,6 @@ Authors: Mac Malone
 -/
 import Lake.Toml
 import Lake.Util.Message
-import Lake.Util.Newline
 
 /-!
 ## TOML Test Runner
@@ -30,7 +29,7 @@ nonrec def loadToml (tomlFile : FilePath) : BaseIO TomlOutcome := do
     match (← IO.FS.readBinFile tomlFile |>.toBaseIO) with
     | .ok bytes =>
       if let some input := String.fromUTF8? bytes then
-        pure (crlf2lf input)
+        pure input.crlfToLf
       else
         return .fail <| MessageLog.empty.add
           {fileName, pos := ⟨1,0⟩, data := m!"file contains invalid characters"}
diff --git a/tests/lean/run/crlfToLf.lean b/tests/lean/run/crlfToLf.lean
@@ -0,0 +1,39 @@
+/-!
+# Test `String.crlfToLf`
+-/
+
+/-!
+Leaves single `\n`'s alone.
+-/
+/-- info: "hello\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\nworld"
+
+/-!
+Turns `\r\n` into `\n`.
+-/
+/-- info: "hello\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\nworld"
+
+/-!
+In a string of `\r...\r\n`, only normalizes the last `\r\n`.
+-/
+/-- info: "hello\x0d\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\r\nworld"
+
+/-!
+Two in a row.
+-/
+/-- info: "hello\n\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\n\r\nworld"
+
+/-!
+Normalizes at the end.
+-/
+/-- info: "hello\nworld\n" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r\n"
+
+/-!
+Can handle a loose `\r` as the last character.
+-/
+/-- info: "hello\nworld\x0d" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r"
diff --git a/tests/lean/run/string_gaps.lean b/tests/lean/run/string_gaps.lean
diff --git a/tests/lean/string_gaps.lean.expected.out b/tests/lean/string_gaps.lean.expected.out