-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathUTF8Str.lean
208 lines (181 loc) · 8.97 KB
/
UTF8Str.lean
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
-- Copyright Kani Contributors
-- SPDX-License-Identifier: Apache-2.0 OR MIT
import RustLeanModels.Basic
import RustLeanModels.RustString
import Lean
open Char
open List
open Mathlib
open RustString
open Nat
set_option maxHeartbeats 10000000
--TODO: prove the axiom from utf8EncodeChar function in Data/String/Extra.lean
opaque toByte (c: Char) (i: Nat) (gi: i < c.utf8Size) : Nat
/-
The following three axioms are stated according to the description of UTF8 encoding
Source: https://en.wikipedia.org/wiki/UTF-8#:~:text=UTF%2D8%20is%20a%20variable,Unicode%20Standard
Axiom "Char_firstbyte_ne_others" is based on the fact that only the first bytes of all characters
start with '0' or '11', while all other bytes start with '10'. In other words, the values of the
first bytes of all characters are outside of the range [128, 191], while the value of other bytes
are inside of the range.
Axiom "Char_size_eq_of_firstbyte_eq" is based on the fact that the number of bytes used to encode
a character is determined by its first byte. Specifically, if the first byte starts with '0',
'11', '111', '1111', then the number of bytes is 1, 2, 3, 4 accordingly.
Axiom "exists_byte_ne_of_Chat_ne" is based on the fact that if two characters are not equal, then
at some (byte) position in their encoding, the two corresponding bytes are different.
-/
axiom Char_firstbyte_ne_others {c1 c2: Char} {i: Nat} {gi: i < c2.utf8Size}:
i ≠ 0 → toByte c1 0 (by linarith [@utf8Size_pos c1]) ≠ toByte c2 i gi
axiom Char_size_eq_of_firstbyte_eq {c1 c2: Char} : toByte c1 0 (by linarith [@utf8Size_pos c1]) = toByte c2 0 (by linarith [@utf8Size_pos c2])
→ utf8Size c1 = utf8Size c2
axiom exists_byte_ne_of_Chat_ne {c1 c2: Char} :
c1 ≠ c2 → ∃ i g1 g2, toByte c1 i g1 ≠ toByte c2 i g2
/-
The Char_toUTF8 function converts a Char into it UTF8 encoding.
Char_toUTF8 is the same as String.utf8EncodeChar, but it is defined based on the opaque function toByte,
so we can perform reasoning on it through the axioms.
-/
def Char_toUTF8_aux (c : Char) (i : Nat) (_: i ≤ c.utf8Size): List Nat := match i with
| 0 => []
| succ n => have gh: (utf8Size c - succ n) < utf8Size c := by omega
have gt: n ≤ utf8Size c :=by omega
toByte c (utf8Size c - succ n) gh :: (Char_toUTF8_aux c n gt)
def Char_toUTF8 (c : Char) := Char_toUTF8_aux c (utf8Size c) (by omega)
/- Convert a string s into it UTF8 encoding-/
def Str_toUTF8 (s: Str) : List Nat := match s with
| [] => []
| h::t => Char_toUTF8 h ++ Str_toUTF8 t
lemma Char_toUTF8_aux_ne_nil (g: i > 0) : Char_toUTF8_aux c i gi ≠ [] := by
induction i ;contradiction
rename_i n ind
unfold Char_toUTF8_aux
cases n ; simp only [zero_eq, reduceSucc, ne_eq, not_false_eq_true, not_sym]
simp only [gt_iff_lt, zero_lt_succ, ne_eq, forall_true_left] at ind
simp only [ne_eq, not_false_eq_true, not_sym]
lemma Char_toUTF8_ne_nil : Char_toUTF8 c ≠ [] := by
apply Char_toUTF8_aux_ne_nil; apply utf8Size_pos
lemma Char_toUTF8_aux_length: (Char_toUTF8_aux c i gi).length = i :=by
induction i; simp only [Char_toUTF8_aux, length_nil]
rename_i n ind
unfold Char_toUTF8_aux; simp only [succ_eq_add_one, length_cons, add_left_inj]
cases n; simp only [Char_toUTF8_aux, length_nil]
unfold Char_toUTF8_aux; simp only [succ_eq_add_one, length_cons, add_left_inj]
simp only [Char_toUTF8_aux, succ_eq_add_one, length_cons, add_left_inj] at ind
rename_i n ; have g: n + 1 ≤ c.utf8Size := by omega
exact (@ind g)
lemma Char_toUTF8_length : (Char_toUTF8 c).length = utf8Size c := Char_toUTF8_aux_length
lemma utf8Size_sub_lt {c: Char} {i: Nat}: c.utf8Size - i.succ < c.utf8Size := by
have:= @utf8Size_pos c; omega
lemma all_bytes_eq_of_Char_eq_aux: Char_toUTF8_aux c1 n g1= Char_toUTF8_aux c2 n g2
→ ∀ i < n, toByte c1 (utf8Size c1 - succ i) utf8Size_sub_lt = toByte c2 (utf8Size c2 - succ i) utf8Size_sub_lt :=by
induction n generalizing c1 c2
unfold Char_toUTF8_aux
simp only [zero_eq, not_lt_zero', IsEmpty.forall_iff, forall_const, forall_true_left]
rename_i n ind
intro g x gx
unfold Char_toUTF8_aux at g
have g:= List.cons_eq_cons.mp g
have g1 : n ≤ utf8Size c1 := by omega
have g2 : n ≤ utf8Size c2 := by omega
have ind:= @ind c1 g1 c2 g2 g.right;
by_cases (x < n); rename_i g0
simp only [g0, ind]
have g0: x = n := by linarith
rw[g0]; exact g.left
lemma eq_utf8Size_of_eq_Char_toUTF8: Char_toUTF8 c1 = Char_toUTF8 c2 → c1.utf8Size = c2.utf8Size:= by
unfold Char_toUTF8 Char_toUTF8_aux
have c1p:= @utf8Size_pos c1
have c2p:= @utf8Size_pos c2
split; omega; split; omega
rename_i g1 _ _ _ _ _ g2 _
intro g
simp only [g1, succ_eq_add_one, le_refl, tsub_eq_zero_of_le, g2, cons.injEq] at g
exact Char_size_eq_of_firstbyte_eq g.left
lemma all_bytes_eq_of_Char_eq (g: Char_toUTF8 c1 = Char_toUTF8 c2):
toByte c1 i g1 = toByte c2 i (by linarith [eq_utf8Size_of_eq_Char_toUTF8 g]) := by
have gs:= eq_utf8Size_of_eq_Char_toUTF8 g
unfold Char_toUTF8 at g
simp only [gs] at g
have g:= all_bytes_eq_of_Char_eq_aux g (utf8Size c1 - succ i) (by omega)
have e1: c1.utf8Size - (c1.utf8Size - i.succ).succ = i :=by omega
have e2: c2.utf8Size - (c1.utf8Size - i.succ).succ = i :=by omega
simp only [succ_eq_add_one, e1, e2] at g
exact g
lemma Char_toUTF8_eq_iff_eq: Char_toUTF8 c1 = Char_toUTF8 c2 ↔ c1 = c2 :=by
constructor
intro g; by_contra; rename_i gc
have gc:= exists_byte_ne_of_Chat_ne gc; obtain ⟨i, g1, _, gc⟩:= gc
have gx:= @all_bytes_eq_of_Char_eq c1 c2 i g1 g
contradiction
intro g; rw[g]
lemma char_eq_of_toByteList_prefix : List.IsPrefix (Char_toUTF8 c1) (Char_toUTF8 c2) → c1 = c2 := by
intro g
have g2: utf8Size c1 = utf8Size c2 := by
unfold Char_toUTF8 Char_toUTF8_aux at g
have i1: utf8Size c1 ≠ 0 := by apply Nat.ne_of_gt; apply utf8Size_pos
have i2: utf8Size c2 ≠ 0 := by apply Nat.ne_of_gt; apply utf8Size_pos
split at g ; contradiction ; split at g; contradiction
simp only [succ_eq_add_one, le_refl, tsub_eq_zero_of_le, *] at g;
exact Char_size_eq_of_firstbyte_eq (prefix_cons g).left
have l1: (Char_toUTF8 c1).length = utf8Size c1 :=by apply Char_toUTF8_length
have l2: (Char_toUTF8 c2).length = utf8Size c2 :=by apply Char_toUTF8_length
have l3: (Char_toUTF8 c1).length = (Char_toUTF8 c2).length := by omega
have ge : Char_toUTF8 c1 = Char_toUTF8 c2:= by apply prefix_eq_self g l3
exact Char_toUTF8_eq_iff_eq.mp ge
lemma prefix_iff_listByte_prefix : List.IsPrefix (Str_toUTF8 p) (Str_toUTF8 s) ↔ List.IsPrefix p s := by
induction p generalizing s
simp only [Str_toUTF8, _root_.nil_prefix]
rename_i hp tp ind
cases s
simp only [Str_toUTF8, prefix_nil, append_eq_nil, not_false_eq_true, not_sym, iff_false, not_and] ;
have : Char_toUTF8 hp ≠ [] := by apply Char_toUTF8_ne_nil
simp only [this, not_false_eq_true, not_sym, false_implies]
rename_i hs ts ; simp [Str_toUTF8] ;
constructor
intro gc
have go:= prefix_append_or gc
cases go; rename_i go; have go:= char_eq_of_toByteList_prefix go
rw [go]; apply prefix_cons_mpr
rw [go] at gc;
have gc:= (List.prefix_append_right_inj (Char_toUTF8 hs)).mp gc
simp only [ind.mp, gc]
rename_i go; have go:= char_eq_of_toByteList_prefix go
rw [go]; apply prefix_cons_mpr
rw [go] at gc;
have gc:= (List.prefix_append_right_inj (Char_toUTF8 hp)).mp gc
simp only [ind.mp, gc]
intro g ; have g:= prefix_cons g
rw [g.left];
apply (prefix_append_right_inj (Char_toUTF8 hs)).mpr
simp only [ind.mpr, g.right]
lemma Str_toUTF8_append: Str_toUTF8 (s1 ++ s2) = Str_toUTF8 s1 ++ Str_toUTF8 s2 :=by
induction s1
simp only [nil_append, Str_toUTF8]
rename_i h1 t1 ind
simp only [Str_toUTF8, append_eq, ind, append_assoc]
lemma Str_toUTF8_eq_iff_eq: Str_toUTF8 s1 = Str_toUTF8 s2 ↔ s1 = s2 :=by
constructor
induction s1 generalizing s2
cases s2; simp only [imp_self]
simp only [Str_toUTF8, nil_eq_append, Char_toUTF8_ne_nil, false_and, not_false_eq_true, not_sym, imp_self]
rename_i h1 t1 ind
cases s2
simp only [Str_toUTF8, append_eq_nil, Char_toUTF8_ne_nil, false_and, not_false_eq_true, not_sym, imp_self]
rename_i h2 t2
simp only [Str_toUTF8, cons.injEq]
intro g
have g1: List.IsPrefix (Char_toUTF8 h1) (Char_toUTF8 h1 ++ Str_toUTF8 t1):=by simp only [prefix_append]
have g2: List.IsPrefix (Char_toUTF8 h2) (Char_toUTF8 h1 ++ Str_toUTF8 t1):=by simp only [g, prefix_append]
have gp:= List.prefix_or_prefix_of_prefix g1 g2
have ge: h1 = h2 :=by
cases gp; exact char_eq_of_toByteList_prefix (by assumption); symm; exact char_eq_of_toByteList_prefix (by assumption)
simp only [ge, true_and]; simp only [ge, append_cancel_left_eq] at g
exact ind g
intro g; simp only [g]
lemma Str_toUTF8_length: (Str_toUTF8 s).length = byteSize s:=by
induction s
simp only [Str_toUTF8, length_nil, byteSize]
rename_i ind
simp only [Str_toUTF8, length_append, Char_toUTF8_length, ind, byteSize]
lemma Str_toUTF8_take_prefix: List.IsPrefix p s → (Str_toUTF8 s).take (byteSize p) = Str_toUTF8 p :=by
intro g; rw[← @Str_toUTF8_length p, prefix_eq_take]; exact prefix_iff_listByte_prefix.mpr g