Skip to content

Commit 8d43e44

Browse files
committed
auto merge of #15867 : cmr/rust/rewrite-lexer4, r=alexcrichton
2 parents 32f4d99 + 95a1ce6 commit 8d43e44

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

73 files changed

+735
-11
lines changed

Makefile.in

+1
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ ifneq ($(strip $(findstring check,$(MAKECMDGOALS)) \
216216
$(findstring tidy,$(MAKECMDGOALS))),)
217217
CFG_INFO := $(info cfg: including test rules)
218218
include $(CFG_SRC_DIR)mk/tests.mk
219+
include $(CFG_SRC_DIR)mk/grammar.mk
219220
endif
220221

221222
# Performance and benchmarking

configure

+3
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,9 @@ probe CFG_VALGRIND valgrind
493493
probe CFG_PERF perf
494494
probe CFG_ISCC iscc
495495
probe CFG_LLNEXTGEN LLnextgen
496+
probe CFG_JAVAC javac
497+
probe CFG_ANTLR4 antlr4
498+
probe CFG_GRUN grun
496499
probe CFG_PANDOC pandoc
497500
probe CFG_PDFLATEX pdflatex
498501
probe CFG_XELATEX xelatex

mk/grammar.mk

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
# file at the top-level directory of this distribution and at
3+
# http://rust-lang.org/COPYRIGHT.
4+
#
5+
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
# option. This file may not be copied, modified, or distributed
9+
# except according to those terms.
10+
11+
# Directories and tools used by the reference-lexer check.
#
#   BG     output directory for the generated lexer sources, the compiled
#          class files and the `verify` binary (keeps its trailing slash)
#   SG     directory holding the grammar sources (RustLexer.g4, verify.rs,
#          check.sh)
#   B      stage2 directory of the build triple
#   L      library directory handed to rustc with -L
#   LD     the same library directory spelled without the $(CFG_BUILD_DIR)
#          prefix; used to name the stamp.* prerequisites of `verify`
#   RUSTC  the stage2 compiler binary
#
# These are simply expanded (:=): they are consumed at parse time in rule
# targets and prerequisites below, so the values they are built from must
# already be set here, and a later redefinition of such short names must not
# silently change what the recipes run.
BG := $(CFG_BUILD_DIR)/grammar/
SG := $(S)src/grammar/
B := $(CFG_BUILD_DIR)/$(CFG_BUILD)/stage2/
L := $(B)lib/rustlib/$(CFG_BUILD)/lib
LD := $(CFG_BUILD)/stage2/lib/rustlib/$(CFG_BUILD)/lib/
RUSTC := $(B)bin/rustc
17+
18+
# Run the reference lexer against libsyntax and compare the tokens and spans.
# If "// ignore-lexer-test" is present in the file, it will be ignored.
#
# $(1) is the file to test.
#
# The compiled lexer classes, RustLexer.tokens and the verify binary are all
# looked up in $(BG): that is where the javac and rustc recipes in this file
# place their output, and it matches the paths check-lexer hands to check.sh.
# verify receives the tested file and the tokens file, in the same order
# check.sh passes them.
define LEXER_TEST
grep "// ignore-lexer-test" $(1) ; \
  if [ $$? -eq 1 ]; then \
   CLASSPATH=$(BG) $(CFG_GRUN) RustLexer tokens -tokens < $(1) \
   | $(BG)verify $(1) $(BG)RustLexer.tokens ; \
  fi
endef
29+
30+
# Create the grammar output directory (and any missing parents).
$(BG):
	$(Q)mkdir -p $@
32+
33+
# Generate the Java lexer from the ANTLR grammar, then compile it.
#
# antlr4 must write into $(BG): the javac step reads $(BG)RustLexer.java and
# the check-lexer recipe expects $(BG)RustLexer.tokens, both of which antlr4
# emits into the directory given by -o.
#
# $(BG) is an order-only prerequisite: the directory has to exist before the
# recipe runs, but its timestamp must never force a rebuild.
$(BG)RustLexer.class: $(SG)RustLexer.g4 | $(BG)
	$(Q)$(CFG_ANTLR4) -o $(BG) $(SG)RustLexer.g4
	$(Q)$(CFG_JAVAC) -d $(BG) $(BG)RustLexer.java
36+
37+
# Build the `verify` tool with the stage2 compiler.
#
# The output directory is listed as an order-only prerequisite so that it is
# guaranteed to exist before rustc runs, including under `make -j`, without
# its timestamp ever triggering a rebuild.
$(BG)verify: $(SG)verify.rs rustc-stage2-H-$(CFG_BUILD) $(LD)stamp.regex_macros $(LD)stamp.rustc | $(BG)
	$(Q)$(RUSTC) -O --out-dir $(BG) -L $(L) $<
39+
40+
# Verify libsyntax's lexer against the ANTLR reference lexer.
#
# The generated lexer and the verify tool are only requested as prerequisites
# when javac, antlr4 and grun were all found by configure. When one of them is
# missing the target has no prerequisites and only reports that the test is
# skipped, instead of failing while trying to build the lexer with a missing
# tool.
#
# check-lexer names an action rather than a file, so it is declared phony.
.PHONY: check-lexer

ifdef CFG_JAVAC
ifdef CFG_ANTLR4
ifdef CFG_GRUN
check-lexer: $(BG)RustLexer.class $(BG)verify | $(BG)
	$(info Verifying libsyntax against the reference lexer ...)
	$(Q)$(SG)check.sh $(S) "$(BG)" \
		"$(CFG_GRUN)" "$(BG)verify" "$(BG)RustLexer.tokens"
else
check-lexer:
	$(info grun not available, skipping lexer test...)
endif
else
check-lexer:
	$(info antlr4 not available, skipping lexer test...)
endif
else
check-lexer:
	$(info javac not available, skipping lexer test...)
endif

mk/tests.mk

+2
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,8 @@ check-docs: cleantestlibs cleantmptestlogs check-stage2-docs
192192
# NOTE: Remove after reprogramming windows bots
193193
check-fast: check-lite
194194

195+
check-syntax: check-lexer
196+
195197
.PHONY: cleantmptestlogs cleantestlibs
196198

197199
cleantmptestlogs:

src/grammar/.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
verify
2+
*.class
3+
*.java
4+
*.tokens

src/grammar/README.md

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Reference grammar.
2+
3+
Uses [antlr4](http://www.antlr.org/) and a custom Rust tool to compare
4+
ASTs/token streams generated. You can use the `check-syntax` make target to
5+
run all of the available tests.
6+
7+
To use manually:
8+
9+
```
10+
antlr4 RustLexer.g4
11+
javac *.java
12+
rustc -O verify.rs
13+
for file in ../*/**.rs; do
14+
echo $file;
15+
grun RustLexer tokens -tokens < $file | ./verify $file || break
16+
done
17+
```
18+
19+
Note that the `../*/**.rs` glob will match every `*.rs` file in the parent
20+
directory and all of its recursive children. This is a zsh extension.

src/grammar/RustLexer.g4

+170
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
lexer grammar RustLexer;
2+
3+
// Token names declared up front. Every operator and structural name listed
// here also has a lexer rule of the same name further down; `PLUS` is the
// name of the '+' rule.
tokens {
    EQ, LT, LE, EQEQ, NE, GE, GT, ANDAND, OROR, NOT, TILDE, PLUS,
    MINUS, STAR, SLASH, PERCENT, CARET, AND, OR, SHL, SHR, BINOP,
    BINOPEQ, AT, DOT, DOTDOT, DOTDOTDOT, COMMA, SEMI, COLON,
    MOD_SEP, RARROW, FAT_ARROW, LPAREN, RPAREN, LBRACKET, RBRACKET,
    LBRACE, RBRACE, POUND, DOLLAR, UNDERSCORE, LIT_CHAR,
    LIT_INTEGER, LIT_FLOAT, LIT_STR, LIT_STR_RAW, LIT_BINARY,
    LIT_BINARY_RAW, IDENT, LIFETIME, WHITESPACE, DOC_COMMENT,
    COMMENT
}
13+
14+
/* Note: due to antlr limitations, we can't represent XID_start and
15+
* XID_continue properly. ASCII-only substitute. */
16+
17+
fragment XID_start : [_a-zA-Z] ;
18+
fragment XID_continue : [_a-zA-Z0-9] ;
19+
20+
21+
/* Expression-operator symbols */
22+
23+
EQ : '=' ;
24+
LT : '<' ;
25+
LE : '<=' ;
26+
EQEQ : '==' ;
27+
NE : '!=' ;
28+
GE : '>=' ;
29+
GT : '>' ;
30+
ANDAND : '&&' ;
31+
OROR : '||' ;
32+
NOT : '!' ;
33+
TILDE : '~' ;
34+
PLUS : '+' ;
35+
MINUS : '-' ;
36+
STAR : '*' ;
37+
SLASH : '/' ;
38+
PERCENT : '%' ;
39+
CARET : '^' ;
40+
AND : '&' ;
41+
OR : '|' ;
42+
SHL : '<<' ;
43+
SHR : '>>' ;
44+
45+
BINOP
46+
: PLUS
47+
| SLASH
48+
| MINUS
49+
| STAR
50+
| PERCENT
51+
| CARET
52+
| AND
53+
| OR
54+
| SHL
55+
| SHR
56+
;
57+
58+
BINOPEQ : BINOP EQ ;
59+
60+
/* "Structural symbols" */
61+
62+
AT : '@' ;
63+
DOT : '.' ;
64+
DOTDOT : '..' ;
65+
DOTDOTDOT : '...' ;
66+
COMMA : ',' ;
67+
SEMI : ';' ;
68+
COLON : ':' ;
69+
MOD_SEP : '::' ;
70+
RARROW : '->' ;
71+
FAT_ARROW : '=>' ;
72+
LPAREN : '(' ;
73+
RPAREN : ')' ;
74+
LBRACKET : '[' ;
75+
RBRACKET : ']' ;
76+
LBRACE : '{' ;
77+
RBRACE : '}' ;
78+
POUND : '#';
79+
DOLLAR : '$' ;
80+
UNDERSCORE : '_' ;
81+
82+
// Literals
83+
84+
fragment HEXIT
85+
: [0-9a-fA-F]
86+
;
87+
88+
fragment CHAR_ESCAPE
89+
: [nrt\\'"0]
90+
| [xX] HEXIT HEXIT
91+
| 'u' HEXIT HEXIT HEXIT HEXIT
92+
| 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
93+
;
94+
95+
LIT_CHAR
96+
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\''
97+
;
98+
99+
LIT_BYTE
100+
: 'b\'' ( '\\' ( [xX] HEXIT HEXIT | [nrt\\'"0] ) | ~[\\'\n\t\r] ) '\''
101+
;
102+
103+
fragment INT_SUFFIX
104+
: 'i'
105+
| 'i8'
106+
| 'i16'
107+
| 'i32'
108+
| 'i64'
109+
| 'u'
110+
| 'u8'
111+
| 'u16'
112+
| 'u32'
113+
| 'u64'
114+
;
115+
116+
LIT_INTEGER
117+
: [0-9][0-9_]* INT_SUFFIX?
118+
| '0b' [01][01_]* INT_SUFFIX?
119+
| '0o' [0-7][0-7_]* INT_SUFFIX?
120+
| '0x' [0-9a-fA-F][0-9a-fA-F_]* INT_SUFFIX?
121+
;
122+
123+
// Suffix of a float literal. This is a fragment, like INT_SUFFIX: it is only
// a building block of LIT_FLOAT. As a standalone token rule it would be
// listed before IDENT and therefore win on the input `f32` or `f64`, turning
// those identifiers into FLOAT_SUFFIX tokens.
fragment FLOAT_SUFFIX
  : 'f32'
  | 'f64'
  | 'f128'
  ;
128+
129+
LIT_FLOAT
130+
: [0-9][0-9_]* ('.' | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? FLOAT_SUFFIX?)
131+
;
132+
133+
LIT_STR
134+
: '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"'
135+
;
136+
137+
LIT_BINARY : 'b' LIT_STR ;
138+
// Raw byte string: a `b` prefix followed by a raw string, which itself starts
// with `r` (e.g. br"..." or br#"..."#).
LIT_BINARY_RAW : 'b' LIT_STR_RAW ;
139+
140+
/* this is a bit messy */
141+
142+
fragment LIT_STR_RAW_INNER
143+
: '"' .*? '"'
144+
| LIT_STR_RAW_INNER2
145+
;
146+
147+
fragment LIT_STR_RAW_INNER2
148+
: POUND LIT_STR_RAW_INNER POUND
149+
;
150+
151+
LIT_STR_RAW
152+
: 'r' LIT_STR_RAW_INNER
153+
;
154+
155+
IDENT : XID_start XID_continue* ;
156+
157+
LIFETIME : '\'' IDENT ;
158+
159+
WHITESPACE : [ \r\n\t]+ ;
160+
161+
UNDOC_COMMENT : '////' ~[\r\n]* -> type(COMMENT) ;
162+
YESDOC_COMMENT : '///' ~[\r\n]* -> type(DOC_COMMENT) ;
163+
OUTER_DOC_COMMENT : '//!' ~[\r\n]* -> type(DOC_COMMENT) ;
164+
LINE_COMMENT : '//' ~[\r\n]* -> type(COMMENT) ;
165+
166+
DOC_BLOCK_COMMENT
167+
: ('/**' ~[*] | '/*!') (DOC_BLOCK_COMMENT | .)*? '*/' -> type(DOC_COMMENT)
168+
;
169+
170+
BLOCK_COMMENT : '/*' (BLOCK_COMMENT | .)*? '*/' -> type(COMMENT) ;

src/grammar/check.sh

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/sh
2+
3+
# Run the reference lexer against libsyntax and compare the tokens and spans.
4+
# If "// ignore-lexer-test" is present in the file, it will be ignored.
5+
6+
7+
# Argument $1 is the file or directory tree to check, $2 is the classpath to use, $3 is the path
8+
# to the grun binary, $4 is the path to the verify binary, $5 is the path to
9+
# RustLexer.tokens
10+
# Trace every command when VERBOSE=1 is set in the environment.
# `=` is the portable string comparison for `[`; `==` is a bash extension that
# stricter /bin/sh implementations reject.
if [ "${VERBOSE}" = "1" ]; then
    set -x
fi
13+
14+
# check FILE CLASSDIR GRUN VERIFY TOKENS
#
# Lex FILE with the reference lexer and pipe the token dump into VERIFY, which
# is also given FILE and TOKENS. Prints "pass: FILE", "fail: FILE" or
# "skip: FILE". Returns non-zero only when the comparison fails.
check() {
    grep -q "// ignore-lexer-test" "$1"

    # grep exits with 1 only when the marker is *not* found; a file that has
    # the marker (0) or cannot be read (>1) is skipped.
    if [ $? -eq 1 ]; then
        # java has to run from the directory that holds RustLexer.class; just
        # adding build/grammar to the CLASSPATH didn't seem to have any effect.
        # The `cd` is confined to a subshell so it cannot leak into later
        # calls, and the input redirection sits outside the subshell so that
        # a relative FILE still resolves against the caller's directory.
        # $3 is left unquoted on purpose so it may carry extra arguments.
        if ( cd "$2" && $3 RustLexer tokens -tokens ) < "$1" | "$4" "$1" "$5"; then
            echo "pass: $1"
        else
            echo "fail: $1"
            return 1
        fi
    else
        echo "skip: $1"
    fi
}

# Check every Rust source below $1 except the compile-fail tests. Reading the
# names line by line keeps paths containing spaces intact. All files are
# checked even after a failure; the script exits non-zero if any file failed.
# The loop runs on the right-hand side of a pipe, so its status is handed out
# through the exit status of the brace group.
find "$1" -iname '*.rs' ! -path '*/test/compile-fail*' | {
    status=0
    while IFS= read -r file; do
        check "$file" "$2" "$3" "$4" "$5" || status=1
    done
    exit $status
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
Rust's lexical grammar is not context-free. Raw string literals are the source
2+
of the problem. Informally, a raw string literal is an `r`, followed by `N`
3+
hashes (where N can be zero), a quote, any characters, then a quote followed
4+
by `N` hashes. The following grammar describes this as well as possible:
5+
6+
R -> 'r' S
7+
S -> '"' B '"'
8+
S -> '#' S '#'
9+
B -> . B
10+
B -> ε
11+
12+
Where `.` represents any character, and `ε` the empty string. Consider the
13+
string `r#""#"#`. This string is not a valid raw string literal, but can be
14+
accepted as one by the above grammar, using the derivation:
15+
16+
R : #""#"#
17+
S : ""#"
18+
S : "#
19+
B : #
20+
B : ε
21+
22+
(Where `T : U` means the rule `T` is applied, and `U` is the remainder of the
23+
string.) The difficulty arises from the fact that it is fundamentally
24+
context-sensitive. In particular, the context needed is the number of hashes.
25+
I know of no way to resolve this, but also have not come up with a proof that
26+
it is not context-free. Such a proof would probably use the pumping lemma
27+
for context-free languages, but I (cmr) could not come up with a proof after
28+
spending a few hours on it, and decided my time was best spent elsewhere. Pull
29+
requests welcome!

0 commit comments

Comments
 (0)