Skip to content

Commit a7338c5

Browse files
authored
feat: make frontend normalize line endings to LF (#3903)
To eliminate parsing differences between Windows and other platforms, the frontend now normalizes all CRLF line endings to LF, like [in Rust](rust-lang/rust#62865). Effects: - This makes Lake hashes be faithful to what Lean sees (Lake already normalizes line endings before computing hashes). - Docstrings now have normalized line endings. In particular, this fixes `#guard_msgs` failing multiline tests for Windows users using CRLF. - Now strings don't have different lengths depending on the platform. Before this PR, the following theorem is true for LF and false for CRLF files. ```lean example : " ".length = 1 := rfl ``` Note: the normalization will take `\r\r\n` and turn it into `\r\n`. In the elaborator, we reject loose `\r`'s that appear in whitespace. Rust instead takes the approach of making the normalization routine fail. They do this so that there's no downstream confusion about any `\r\n` that appears. Implementation note: the LSP maintains its own copy of a source file that it updates when edit operations are applied. We are assuming that edit operations never split or join CRLFs. If this assumption is not correct, then the LSP copy of a source file can become slightly out of sync. If this is an issue, there is some discussion [here](#3903 (comment)).
1 parent b278f9d commit a7338c5

File tree

13 files changed

+137
-88
lines changed

13 files changed

+137
-88
lines changed

src/Init/Data/String/Extra.lean

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,4 +198,35 @@ def removeLeadingSpaces (s : String) : String :=
198198
let n := findLeadingSpacesSize s
199199
if n == 0 then s else removeNumLeadingSpaces n s
200200

201+
/--
202+
Replaces each `\r\n` with `\n` to normalize line endings,
203+
but does not validate that there are no isolated `\r` characters.
204+
It is an optimized version of `String.replace text "\r\n" "\n"`.
205+
-/
206+
def crlfToLf (text : String) : String :=
207+
go "" 0 0
208+
where
209+
go (acc : String) (accStop pos : String.Pos) : String :=
210+
if h : text.atEnd pos then
211+
-- note: if accStop = 0 then acc is empty
212+
if accStop = 0 then text else acc ++ text.extract accStop pos
213+
else
214+
let c := text.get' pos h
215+
let pos' := text.next' pos h
216+
if h' : ¬ text.atEnd pos' ∧ c == '\r' ∧ text.get pos' == '\n' then
217+
let acc := acc ++ text.extract accStop pos
218+
go acc pos' (text.next' pos' h'.1)
219+
else
220+
go acc accStop pos'
221+
termination_by text.utf8ByteSize - pos.byteIdx
222+
decreasing_by
223+
decreasing_with
224+
show text.utf8ByteSize - (text.next' (text.next' pos _) _).byteIdx < text.utf8ByteSize - pos.byteIdx
225+
have k := Nat.gt_of_not_le <| mt decide_eq_true h
226+
exact Nat.sub_lt_sub_left k (Nat.lt_trans (String.lt_next text pos) (String.lt_next _ _))
227+
decreasing_with
228+
show text.utf8ByteSize - (text.next' pos _).byteIdx < text.utf8ByteSize - pos.byteIdx
229+
have k := Nat.gt_of_not_le <| mt decide_eq_true h
230+
exact Nat.sub_lt_sub_left k (String.lt_next _ _)
231+
201232
end String

src/Lean/Data/Position.lean

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def ofPosition (text : FileMap) (pos : Position) : String.Pos :=
106106

107107
/--
108108
Returns the position of the start of (1-based) line `line`.
109-
This gives the stame result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
109+
This gives the same result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
110110
-/
111111
def lineStart (map : FileMap) (line : Nat) : String.Pos :=
112112
if h : line - 1 < map.positions.size then

src/Lean/Parser/Basic.lean

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,8 @@ partial def whitespace : ParserFn := fun c s =>
558558
let curr := input.get' i h
559559
if curr == '\t' then
560560
s.mkUnexpectedError (pushMissing := false) "tabs are not allowed; please configure your editor to expand them"
561+
else if curr == '\r' then
562+
s.mkUnexpectedError (pushMissing := false) "isolated carriage returns are not allowed"
561563
else if curr.isWhitespace then whitespace c (s.next' input i h)
562564
else if curr == '-' then
563565
let i := input.next' i h
@@ -621,9 +623,8 @@ def hexDigitFn : ParserFn := fun c s =>
621623
/--
622624
Parses the whitespace after the `\` when there is a string gap.
623625
Raises an error if the whitespace does not contain exactly one newline character.
624-
Processes `\r\n` as a newline.
625626
-/
626-
partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
627+
partial def stringGapFn (seenNewline : Bool) : ParserFn := fun c s =>
627628
let i := s.pos
628629
if h : c.input.atEnd i then s -- let strLitFnAux handle the EOI error if !seenNewline
629630
else
@@ -633,13 +634,9 @@ partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
633634
-- Having more than one newline in a string gap is visually confusing
634635
s.mkUnexpectedError "unexpected additional newline in string gap"
635636
else
636-
stringGapFn true false c (s.next' c.input i h)
637-
else if curr == '\r' then
638-
stringGapFn seenNewline true c (s.next' c.input i h)
639-
else if afterCR then
640-
s.mkUnexpectedError "expecting newline after carriage return"
637+
stringGapFn true c (s.next' c.input i h)
641638
else if curr.isWhitespace then
642-
stringGapFn seenNewline false c (s.next' c.input i h)
639+
stringGapFn seenNewline c (s.next' c.input i h)
643640
else if seenNewline then
644641
s
645642
else
@@ -663,8 +660,8 @@ def quotedCharCoreFn (isQuotable : Char → Bool) (inString : Bool) : ParserFn :
663660
andthenFn hexDigitFn hexDigitFn c (s.next' input i h)
664661
else if curr == 'u' then
665662
andthenFn hexDigitFn (andthenFn hexDigitFn (andthenFn hexDigitFn hexDigitFn)) c (s.next' input i h)
666-
else if inString && (curr == '\n' || curr == '\r') then
667-
stringGapFn false false c s
663+
else if inString && curr == '\n' then
664+
stringGapFn false c s
668665
else
669666
s.mkUnexpectedError "invalid escape sequence"
670667

src/Lean/Parser/Extension.lean

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -437,11 +437,12 @@ def getSyntaxNodeKinds (env : Environment) : List SyntaxNodeKind :=
437437
def getTokenTable (env : Environment) : TokenTable :=
438438
(parserExtension.getState env).tokens
439439

440-
def mkInputContext (input : String) (fileName : String) : InputContext := {
441-
input := input,
442-
fileName := fileName,
443-
fileMap := input.toFileMap
444-
}
440+
-- Note: `crlfToLf` preserves logical line and column numbers for each character.
441+
def mkInputContext (input : String) (fileName : String) (normalizeLineEndings := true) : InputContext :=
442+
let input' := if normalizeLineEndings then input.crlfToLf else input
443+
{ input := input',
444+
fileName := fileName,
445+
fileMap := input'.toFileMap }
445446

446447
def mkParserState (input : String) : ParserState :=
447448
{ cache := initCacheForInput input }
@@ -453,7 +454,7 @@ def runParserCategory (env : Environment) (catName : Name) (input : String) (fil
453454
let s := p.run ictx { env, options := {} } (getTokenTable env) (mkParserState input)
454455
if !s.allErrors.isEmpty then
455456
Except.error (s.toErrorMsg ictx)
456-
else if input.atEnd s.pos then
457+
else if ictx.input.atEnd s.pos then
457458
Except.ok s.stxStack.back
458459
else
459460
Except.error ((s.mkError "end of input").toErrorMsg ictx)

src/Lean/Server/FileWorker.lean

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -705,12 +705,9 @@ def initAndRunWorker (i o e : FS.Stream) (opts : Options) : IO UInt32 := do
705705
let initParams ← i.readLspRequestAs "initialize" InitializeParams
706706
let ⟨_, param⟩ ← i.readLspNotificationAs "textDocument/didOpen" LeanDidOpenTextDocumentParams
707707
let doc := param.textDocument
708-
/- NOTE(WN): `toFileMap` marks line beginnings as immediately following
709-
"\n", which should be enough to handle both LF and CRLF correctly.
710-
This is because LSP always refers to characters by (line, column),
711-
so if we get the line number correct it shouldn't matter that there
712-
is a CR there. -/
713-
let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.toFileMap, param.dependencyBuildMode?.getD .always⟩
708+
/- Note (kmill): LSP always refers to characters by (line, column),
709+
so converting CRLF to LF preserves line and column numbers. -/
710+
let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, param.dependencyBuildMode?.getD .always⟩
714711
let e := e.withPrefix s!"[{param.textDocument.uri}] "
715712
let _ ← IO.setStderr e
716713
let (ctx, st) ← try

src/Lean/Server/Utils.lean

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,10 @@ structure DocumentMeta where
7575
uri : Lsp.DocumentUri
7676
/-- Version number of the document. Incremented whenever the document is edited. -/
7777
version : Nat
78-
/-- Current text of the document. -/
78+
/--
79+
Current text of the document.
80+
It is maintained such that it is normalized using `String.crlfToLf`, which preserves logical line/column numbers.
81+
(Note: we assume that edit operations never split or merge `\r\n` line endings.) -/
7982
text : FileMap
8083
/--
8184
Controls when dependencies of the document are built on `textDocument/didOpen` notifications.
@@ -98,7 +101,11 @@ def replaceLspRange (text : FileMap) (r : Lsp.Range) (newText : String) : FileMa
98101
let «end» := text.lspPosToUtf8Pos r.«end»
99102
let pre := text.source.extract 0 start
100103
let post := text.source.extract «end» text.source.endPos
101-
(pre ++ newText ++ post).toFileMap
104+
-- `pre` and `post` already have normalized line endings, so only `newText` needs its endings normalized.
105+
-- Note: this assumes that editing never separates a `\r\n`.
106+
-- If `pre` ends with `\r` and `newText` begins with `\n`, the result is potentially inaccurate.
107+
-- If this is ever a problem, we could store a second unnormalized FileMap, edit it, and normalize it here.
108+
(pre ++ newText.crlfToLf ++ post).toFileMap
102109

103110
open IO
104111

@@ -125,7 +132,7 @@ def applyDocumentChange (oldText : FileMap) : (change : Lsp.TextDocumentContentC
125132
| TextDocumentContentChangeEvent.rangeChange (range : Range) (newText : String) =>
126133
replaceLspRange oldText range newText
127134
| TextDocumentContentChangeEvent.fullChange (newText : String) =>
128-
newText.toFileMap
135+
newText.crlfToLf.toFileMap
129136

130137
/-- Returns the document contents with all changes applied. -/
131138
def foldDocumentChanges (changes : Array Lsp.TextDocumentContentChangeEvent) (oldText : FileMap) : FileMap :=

src/Lean/Server/Watchdog.lean

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -728,12 +728,9 @@ end RequestHandling
728728
section NotificationHandling
729729
def handleDidOpen (p : LeanDidOpenTextDocumentParams) : ServerM Unit :=
730730
let doc := p.textDocument
731-
/- NOTE(WN): `toFileMap` marks line beginnings as immediately following
732-
"\n", which should be enough to handle both LF and CRLF correctly.
733-
This is because LSP always refers to characters by (line, column),
734-
so if we get the line number correct it shouldn't matter that there
735-
is a CR there. -/
736-
startFileWorker ⟨doc.uri, doc.version, doc.text.toFileMap, p.dependencyBuildMode?.getD .always⟩
731+
/- Note (kmill): LSP always refers to characters by (line, column),
732+
so converting CRLF to LF preserves line and column numbers. -/
733+
startFileWorker ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, p.dependencyBuildMode?.getD .always⟩
737734

738735
def handleDidChange (p : DidChangeTextDocumentParams) : ServerM Unit := do
739736
let doc := p.textDocument

src/lake/Lake/Build/Trace.lean

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ Copyright (c) 2021 Mac Malone. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
Authors: Mac Malone
55
-/
6-
import Lake.Util.Newline
76

87
open System
98
namespace Lake
@@ -135,7 +134,7 @@ instance : ComputeHash FilePath IO := ⟨computeFileHash⟩
135134

136135
def computeTextFileHash (file : FilePath) : IO Hash := do
137136
let text ← IO.FS.readFile file
138-
let text := crlf2lf text
137+
let text := text.crlfToLf
139138
return Hash.ofString text
140139

141140
/--

src/lake/Lake/Util/Newline.lean

Lines changed: 0 additions & 24 deletions
This file was deleted.

src/lake/tests/toml/Test.lean

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ Authors: Mac Malone
55
-/
66
import Lake.Toml
77
import Lake.Util.Message
8-
import Lake.Util.Newline
98

109
/-!
1110
## TOML Test Runner
@@ -30,7 +29,7 @@ nonrec def loadToml (tomlFile : FilePath) : BaseIO TomlOutcome := do
3029
match (← IO.FS.readBinFile tomlFile |>.toBaseIO) with
3130
| .ok bytes =>
3231
if let some input := String.fromUTF8? bytes then
33-
pure (crlf2lf input)
32+
pure input.crlfToLf
3433
else
3534
return .fail <| MessageLog.empty.add
3635
{fileName, pos := ⟨1,0⟩, data := m!"file contains invalid characters"}

tests/lean/run/crlfToLf.lean

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/-!
2+
# Test `String.crlfToLf`
3+
-/
4+
5+
/-!
6+
Leaves single `\n`'s alone.
7+
-/
8+
/-- info: "hello\nworld" -/
9+
#guard_msgs in #eval String.crlfToLf "hello\nworld"
10+
11+
/-!
12+
Turns `\r\n` into `\n`.
13+
-/
14+
/-- info: "hello\nworld" -/
15+
#guard_msgs in #eval String.crlfToLf "hello\r\nworld"
16+
17+
/-!
18+
In a string of `\r...\r\n`, only normalizes the last `\r\n`.
19+
-/
20+
/-- info: "hello\x0d\nworld" -/
21+
#guard_msgs in #eval String.crlfToLf "hello\r\r\nworld"
22+
23+
/-!
24+
Two in a row.
25+
-/
26+
/-- info: "hello\n\nworld" -/
27+
#guard_msgs in #eval String.crlfToLf "hello\r\n\r\nworld"
28+
29+
/-!
30+
Normalizes at the end.
31+
-/
32+
/-- info: "hello\nworld\n" -/
33+
#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r\n"
34+
35+
/-!
36+
Can handle a loose `\r` as the last character.
37+
-/
38+
/-- info: "hello\nworld\x0d" -/
39+
#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r"

0 commit comments

Comments
 (0)