Skip to content

Commit ce0c4f6

Browse files
committed
Auto merge of #66670 - crlf0710:normalize_ident, r=<try>
Normalize ident: perform Unicode normalization (NFC) on identifiers. Resolves the first bullet point in #55467.
2 parents a44774c + 0e3036b commit ce0c4f6

File tree

5 files changed

+31
-5
lines changed

5 files changed

+31
-5
lines changed

Cargo.lock

+6-2
Original file line numberDiff line numberDiff line change
@@ -3813,6 +3813,7 @@ dependencies = [
38133813
"smallvec 1.0.0",
38143814
"syntax",
38153815
"syntax_pos",
3816+
"unicode-normalization",
38163817
]
38173818

38183819
[[package]]
@@ -5006,9 +5007,12 @@ dependencies = [
50065007

50075008
[[package]]
50085009
name = "unicode-normalization"
5009-
version = "0.1.7"
5010+
version = "0.1.11"
50105011
source = "registry+https://github.com/rust-lang/crates.io-index"
5011-
checksum = "6a0180bc61fc5a987082bfa111f4cc95c4caff7f9799f3e46df09163a937aa25"
5012+
checksum = "b561e267b2326bb4cebfc0ef9e68355c7abe6c6f522aeac2f5bf95d56c59bdcf"
5013+
dependencies = [
5014+
"smallvec 1.0.0",
5015+
]
50125016

50135017
[[package]]
50145018
name = "unicode-segmentation"

src/librustc_parse/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,4 @@ rustc_lexer = { path = "../librustc_lexer" }
2020
rustc_target = { path = "../librustc_target" }
2121
smallvec = { version = "1.0", features = ["union", "may_dangle"] }
2222
rustc_error_codes = { path = "../librustc_error_codes" }
23+
unicode-normalization = "0.1.11"

src/librustc_parse/lexer/mod.rs

+15-2
Original file line numberDiff line numberDiff line change
@@ -220,8 +220,7 @@ impl<'a> StringReader<'a> {
220220
if is_raw_ident {
221221
ident_start = ident_start + BytePos(2);
222222
}
223-
// FIXME: perform NFKC normalization here. (Issue #2253)
224-
let sym = self.symbol_from(ident_start);
223+
let sym = self.nfc_symbol_from(ident_start);
225224
if is_raw_ident {
226225
let span = self.mk_sp(start, self.pos);
227226
if !sym.can_be_raw() {
@@ -466,6 +465,20 @@ impl<'a> StringReader<'a> {
466465
Symbol::intern(self.str_from_to(start, end))
467466
}
468467

468+
/// As symbol_from, with the text normalized into Unicode NFC form.
469+
fn nfc_symbol_from(&self, start: BytePos) -> Symbol {
470+
use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};
471+
debug!("taking an normalized ident from {:?} to {:?}", start, self.pos);
472+
let sym = self.str_from(start);
473+
match is_nfc_quick(sym.chars()) {
474+
IsNormalized::Yes => Symbol::intern(sym),
475+
_ => {
476+
let sym_str: String = sym.chars().nfc().collect();
477+
Symbol::intern(&sym_str)
478+
}
479+
}
480+
}
481+
469482
/// Slice of the source text spanning from `start` up to but excluding `end`.
470483
fn str_from_to(&self, start: BytePos, end: BytePos) -> &str
471484
{

src/test/ui/codemap_tests/unicode_2.stderr

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ LL | let _ = ("아あ", 1i42);
1414
|
1515
= help: valid widths are 8, 16, 32, 64 and 128
1616

17-
error[E0425]: cannot find value `a̐é` in this scope
17+
error[E0425]: cannot find value `a̐é` in this scope
1818
--> $DIR/unicode_2.rs:6:13
1919
|
2020
LL | let _ = a̐é;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
// check-pass
2+
#![feature(non_ascii_idents)]
3+
4+
struct Résumé; // ['LATIN SMALL LETTER E WITH ACUTE']
5+
6+
fn main() {
7+
let _ = Résumé; // ['LATIN SMALL LETTER E', 'COMBINING ACUTE ACCENT']
8+
}

0 commit comments

Comments
 (0)