Skip to content

Commit 0eba6f6

Browse files
authored
Add a canonical encoding of identifiers as numbers and use it in clightgen (#353)
Within CompCert, identifiers (names of C functions, variables, types, etc) are represented by unique positive numbers, sometimes called "atoms". In the original implementation, atoms 1, 2, ..., N are assigned to identifiers as they are encountered. The resulting number are small and are efficient when used as keys in data structures such as PTrees. However, the mapping from C source-level identifiers to atoms differs between compilation units. This is not a problem for CompCert but complicates CompCert-based verification tools that need to combine several compilation units. This commit introduces an alternate implementation of atoms, suggested by Andrew Appel. The choice between implementations is governed by the Boolean reference `Camlcoq.use_canonical_atoms`. In the alternate implementation, identifiers are converted to bit sequences via a Huffman encoding, then the bits are represented as positive numbers. The same identifier is always represented by the same number. However, the numbers are usually bigger than in the original implementation, making PTree operations slower: lookups and updates take time linear in the length of the identifier, instead of logarithmic time in the number of identifiers encountered. The CompCert compiler (the `ccomp` executable) still uses the original implementation, but the `clightgen` tool used in conjunction with the VST program logic can use either implementations: - The alternate "canonical atoms" implementation is used by default, and also if the `-canonical-idents` option is given. - The original implementation is used if the `-short-idents` option is given. Closes: #222 Closes: #311
1 parent 4a67662 commit 0eba6f6

File tree

4 files changed

+204
-20
lines changed

4 files changed

+204
-20
lines changed

exportclight/Clightdefs.v

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
(** All imports and definitions used by .v Clight files generated by clightgen *)
1717

18-
From Coq Require Import String List ZArith.
18+
From Coq Require Import Ascii String List ZArith.
1919
From compcert Require Import Integers Floats Maps Errors AST Ctypes Cop Clight.
2020

2121
Definition tvoid := Tvoid.
@@ -80,3 +80,102 @@ Definition mkprogram (types: list composite_definition)
8080
prog_types := types;
8181
prog_comp_env := ce;
8282
prog_comp_env_eq := EQ |}.
83+
84+
(** The following encoding of character strings as positive numbers
85+
must be kept consistent with the OCaml function [Camlcoq.pos_of_string]. *)
86+
87+
Definition append_bit_pos (b: bool) (p: positive) : positive :=
88+
if b then xI p else xO p.
89+
90+
Definition append_char_pos_default (c: ascii) (p: positive) : positive :=
91+
let '(Ascii b7 b6 b5 b4 b3 b2 b1 b0) := c in
92+
xI (xI (xI (xI (xI (xI
93+
(append_bit_pos b0 (append_bit_pos b1
94+
(append_bit_pos b2 (append_bit_pos b3
95+
(append_bit_pos b4 (append_bit_pos b5
96+
(append_bit_pos b6 (append_bit_pos b7 p))))))))))))).
97+
98+
Definition append_char_pos (c: ascii) (p: positive) : positive :=
99+
match c with
100+
| "0"%char => xO (xO (xO (xO (xO (xO p)))))
101+
| "1"%char => xI (xO (xO (xO (xO (xO p)))))
102+
| "2"%char => xO (xI (xO (xO (xO (xO p)))))
103+
| "3"%char => xI (xI (xO (xO (xO (xO p)))))
104+
| "4"%char => xO (xO (xI (xO (xO (xO p)))))
105+
| "5"%char => xI (xO (xI (xO (xO (xO p)))))
106+
| "6"%char => xO (xI (xI (xO (xO (xO p)))))
107+
| "7"%char => xI (xI (xI (xO (xO (xO p)))))
108+
| "8"%char => xO (xO (xO (xI (xO (xO p)))))
109+
| "9"%char => xI (xO (xO (xI (xO (xO p)))))
110+
| "a"%char => xO (xI (xO (xI (xO (xO p)))))
111+
| "b"%char => xI (xI (xO (xI (xO (xO p)))))
112+
| "c"%char => xO (xO (xI (xI (xO (xO p)))))
113+
| "d"%char => xI (xO (xI (xI (xO (xO p)))))
114+
| "e"%char => xO (xI (xI (xI (xO (xO p)))))
115+
| "f"%char => xI (xI (xI (xI (xO (xO p)))))
116+
| "g"%char => xO (xO (xO (xO (xI (xO p)))))
117+
| "h"%char => xI (xO (xO (xO (xI (xO p)))))
118+
| "i"%char => xO (xI (xO (xO (xI (xO p)))))
119+
| "j"%char => xI (xI (xO (xO (xI (xO p)))))
120+
| "k"%char => xO (xO (xI (xO (xI (xO p)))))
121+
| "l"%char => xI (xO (xI (xO (xI (xO p)))))
122+
| "m"%char => xO (xI (xI (xO (xI (xO p)))))
123+
| "n"%char => xI (xI (xI (xO (xI (xO p)))))
124+
| "o"%char => xO (xO (xO (xI (xI (xO p)))))
125+
| "p"%char => xI (xO (xO (xI (xI (xO p)))))
126+
| "q"%char => xO (xI (xO (xI (xI (xO p)))))
127+
| "r"%char => xI (xI (xO (xI (xI (xO p)))))
128+
| "s"%char => xO (xO (xI (xI (xI (xO p)))))
129+
| "t"%char => xI (xO (xI (xI (xI (xO p)))))
130+
| "u"%char => xO (xI (xI (xI (xI (xO p)))))
131+
| "v"%char => xI (xI (xI (xI (xI (xO p)))))
132+
| "w"%char => xO (xO (xO (xO (xO (xI p)))))
133+
| "x"%char => xI (xO (xO (xO (xO (xI p)))))
134+
| "y"%char => xO (xI (xO (xO (xO (xI p)))))
135+
| "z"%char => xI (xI (xO (xO (xO (xI p)))))
136+
| "A"%char => xO (xO (xI (xO (xO (xI p)))))
137+
| "B"%char => xI (xO (xI (xO (xO (xI p)))))
138+
| "C"%char => xO (xI (xI (xO (xO (xI p)))))
139+
| "D"%char => xI (xI (xI (xO (xO (xI p)))))
140+
| "E"%char => xO (xO (xO (xI (xO (xI p)))))
141+
| "F"%char => xI (xO (xO (xI (xO (xI p)))))
142+
| "G"%char => xO (xI (xO (xI (xO (xI p)))))
143+
| "H"%char => xI (xI (xO (xI (xO (xI p)))))
144+
| "I"%char => xO (xO (xI (xI (xO (xI p)))))
145+
| "J"%char => xI (xO (xI (xI (xO (xI p)))))
146+
| "K"%char => xO (xI (xI (xI (xO (xI p)))))
147+
| "L"%char => xI (xI (xI (xI (xO (xI p)))))
148+
| "M"%char => xO (xO (xO (xO (xI (xI p)))))
149+
| "N"%char => xI (xO (xO (xO (xI (xI p)))))
150+
| "O"%char => xO (xI (xO (xO (xI (xI p)))))
151+
| "P"%char => xI (xI (xO (xO (xI (xI p)))))
152+
| "Q"%char => xO (xO (xI (xO (xI (xI p)))))
153+
| "R"%char => xI (xO (xI (xO (xI (xI p)))))
154+
| "S"%char => xO (xI (xI (xO (xI (xI p)))))
155+
| "T"%char => xI (xI (xI (xO (xI (xI p)))))
156+
| "U"%char => xO (xO (xO (xI (xI (xI p)))))
157+
| "V"%char => xI (xO (xO (xI (xI (xI p)))))
158+
| "W"%char => xO (xI (xO (xI (xI (xI p)))))
159+
| "X"%char => xI (xI (xO (xI (xI (xI p)))))
160+
| "Y"%char => xO (xO (xI (xI (xI (xI p)))))
161+
| "Z"%char => xI (xO (xI (xI (xI (xI p)))))
162+
| "_"%char => xO (xI (xI (xI (xI (xI p)))))
163+
| _ => append_char_pos_default c p
164+
end.
165+
166+
Fixpoint ident_of_string (s: string) : ident :=
167+
match s with
168+
| EmptyString => xH
169+
| String c s => append_char_pos c (ident_of_string s)
170+
end.
171+
172+
(** A convenient notation [$ "ident"] to force evaluation of
173+
[ident_of_string "ident"] *)
174+
175+
Ltac ident_of_string s :=
176+
let x := constr:(ident_of_string s) in
177+
let y := eval compute in x in
178+
exact y.
179+
180+
Notation "$ s" := (ltac:(ident_of_string s))
181+
(at level 1, only parsing) : string_scope.

exportclight/Clightgen.ml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ Recognized source files:
9898
.i or .p C source file that should not be preprocessed
9999
Processing options:
100100
-normalize Normalize the generated Clight code w.r.t. loads in expressions
101+
-canonical-idents Use canonical numbers to represent identifiers (default)
102+
-short-idents Use canonical numbers to represent identifiers
101103
-E Preprocess only, send result to standard output
102104
-o <file> Generate output in <file>
103105
|} ^
@@ -142,6 +144,8 @@ let cmdline_actions =
142144
(* Processing options *)
143145
[ Exact "-E", Set option_E;
144146
Exact "-normalize", Set option_normalize;
147+
Exact "-canonical-idents", Set Camlcoq.use_canonical_atoms;
148+
Exact "-short-idents", Unset Camlcoq.use_canonical_atoms;
145149
Exact "-o", String(fun s -> option_o := Some s);
146150
Prefix "-o", Self (fun s -> let s = String.sub s 2 ((String.length s) - 2) in
147151
option_o := Some s);]
@@ -175,20 +179,21 @@ let cmdline_actions =
175179
]
176180

177181
let _ =
178-
try
182+
try
179183
Gc.set { (Gc.get()) with
180184
Gc.minor_heap_size = 524288; (* 512k *)
181185
Gc.major_heap_increment = 4194304 (* 4M *)
182186
};
183187
Printexc.record_backtrace true;
188+
Camlcoq.use_canonical_atoms := true;
184189
Frontend.init ();
185190
parse_cmdline cmdline_actions;
186191
if !option_o <> None && !num_input_files >= 2 then
187192
fatal_error no_loc "Ambiguous '-o' option (multiple source files)";
188193
if !num_input_files = 0 then
189194
fatal_error no_loc "no input file";
190195
perform_actions ()
191-
with
196+
with
192197
| Sys_error msg
193198
| CmdError msg -> error no_loc "%s" msg; exit 2
194199
| Abort -> exit 2

exportclight/ExportClight.ml

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,12 @@ let define_idents p =
8181
string_of_atom
8282
(fun (id, name) ->
8383
try
84-
fprintf p "Definition _%s : ident := %ld%%positive.@ "
85-
(sanitize name) (P.to_int32 id)
84+
if id = pos_of_string name then
85+
fprintf p "Definition _%s : ident := $\"%s\".@ "
86+
(sanitize name) name
87+
else
88+
fprintf p "Definition _%s : ident := %ld%%positive.@ "
89+
(sanitize name) (P.to_int32 id)
8690
with Not_an_identifier ->
8791
());
8892
iter_hashtbl_sorted
@@ -93,9 +97,11 @@ let define_idents p =
9397
fprintf p "@ "
9498

9599
let name_temporary t =
96-
let t1 = P.to_int t and t0 = P.to_int (first_unused_ident ()) in
97-
if t1 >= t0 && not (Hashtbl.mem temp_names t)
98-
then Hashtbl.add temp_names t (sprintf "_t'%d" (t1 - t0 + 1))
100+
if not (Hashtbl.mem string_of_atom t) && not (Hashtbl.mem temp_names t)
101+
then begin
102+
let t1 = P.to_int t and t0 = P.to_int (first_unused_ident ()) in
103+
Hashtbl.add temp_names t (sprintf "_t'%d" (t1 - t0 + 1))
104+
end
99105

100106
let name_opt_temporary = function
101107
| None -> ()
@@ -468,7 +474,7 @@ let print_assertion p (txt, targs) =
468474
| Text _ -> ()
469475
| Param n -> max_param := max n !max_param)
470476
frags;
471-
fprintf p " | \"%s\"%%string, " txt;
477+
fprintf p " | \"%s\", " txt;
472478
list_iteri
473479
(fun i targ -> fprintf p "_x%d :: " (i + 1))
474480
targs;
@@ -495,7 +501,8 @@ let print_assertions p =
495501
let prologue = "\
496502
From Coq Require Import String List ZArith.\n\
497503
From compcert Require Import Coqlib Integers Floats AST Ctypes Cop Clight Clightdefs.\n\
498-
Local Open Scope Z_scope.\n"
504+
Local Open Scope Z_scope.\n\
505+
Local Open Scope string_scope.\n"
499506

500507
(* Naming the compiler-generated temporaries occurring in the program *)
501508

@@ -554,15 +561,15 @@ let name_program p =
554561

555562
let print_clightgen_info p sourcefile normalized =
556563
fprintf p "@[<v 2>Module Info.";
557-
fprintf p "@ Definition version := %S%%string." Version.version;
558-
fprintf p "@ Definition build_number := %S%%string." Version.buildnr;
559-
fprintf p "@ Definition build_tag := %S%%string." Version.tag;
560-
fprintf p "@ Definition arch := %S%%string." Configuration.arch;
561-
fprintf p "@ Definition model := %S%%string." Configuration.model;
562-
fprintf p "@ Definition abi := %S%%string." Configuration.abi;
564+
fprintf p "@ Definition version := %S." Version.version;
565+
fprintf p "@ Definition build_number := %S." Version.buildnr;
566+
fprintf p "@ Definition build_tag := %S." Version.tag;
567+
fprintf p "@ Definition arch := %S." Configuration.arch;
568+
fprintf p "@ Definition model := %S." Configuration.model;
569+
fprintf p "@ Definition abi := %S." Configuration.abi;
563570
fprintf p "@ Definition bitsize := %d." (if Archi.ptr64 then 64 else 32);
564571
fprintf p "@ Definition big_endian := %B." Archi.big_endian;
565-
fprintf p "@ Definition source_file := %S%%string." sourcefile;
572+
fprintf p "@ Definition source_file := %S." sourcefile;
566573
fprintf p "@ Definition normalized := %B." normalized;
567574
fprintf p "@]@ End Info.@ @ "
568575

lib/Camlcoq.ml

Lines changed: 76 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -282,23 +282,96 @@ type atom = positive
282282
let atom_of_string = (Hashtbl.create 17 : (string, atom) Hashtbl.t)
283283
let string_of_atom = (Hashtbl.create 17 : (atom, string) Hashtbl.t)
284284
let next_atom = ref Coq_xH
285+
let use_canonical_atoms = ref false
286+
287+
(* If [use_canonical_atoms] is false, strings are numbered from 1 up
288+
in the order in which they are encountered. This produces small
289+
numbers, and is therefore efficient, but the number for a given
290+
string may differ between the compilation of different units.
291+
292+
If [use_canonical_atoms] is true, strings are Huffman-encoded as bit
293+
sequences, which are then encoded as positive numbers. The same
294+
string is always represented by the same number in all compilation
295+
units. However, the numbers are bigger than in the first
296+
implementation. Also, this places a hard limit on the number of
297+
fresh identifiers that can be generated starting with
298+
[first_unused_ident]. *)
299+
300+
let rec append_bits_pos nbits n p =
301+
if nbits <= 0 then p else
302+
if n land 1 = 0
303+
then Coq_xO (append_bits_pos (nbits - 1) (n lsr 1) p)
304+
else Coq_xI (append_bits_pos (nbits - 1) (n lsr 1) p)
305+
306+
(* The encoding of strings as bit sequences is optimized for C identifiers:
307+
- numbers are encoded as a 6-bit integer between 0 and 9
308+
- lowercase letters are encoded as a 6-bit integer between 10 and 35
309+
- uppercase letters are encoded as a 6-bit integer between 36 and 61
310+
- the underscore character is encoded as the 6-bit integer 62
311+
- all other characters are encoded as 6 "one" bits followed by
312+
the 8-bit encoding of the character. *)
313+
314+
let append_char_pos c p =
315+
match c with
316+
| '0'..'9' -> append_bits_pos 6 (Char.code c - Char.code '0') p
317+
| 'a'..'z' -> append_bits_pos 6 (Char.code c - Char.code 'a' + 10) p
318+
| 'A'..'Z' -> append_bits_pos 6 (Char.code c - Char.code 'A' + 36) p
319+
| '_' -> append_bits_pos 6 62 p
320+
| _ -> append_bits_pos 6 63 (append_bits_pos 8 (Char.code c) p)
321+
322+
(* The empty string is represented as the positive "1", that is, [xH]. *)
323+
324+
let pos_of_string s =
325+
let rec encode i accu =
326+
if i < 0 then accu else encode (i - 1) (append_char_pos s.[i] accu)
327+
in encode (String.length s - 1) Coq_xH
328+
329+
let fresh_atom () =
330+
let a = !next_atom in
331+
next_atom := Pos.succ !next_atom;
332+
a
285333

286334
let intern_string s =
287335
try
288336
Hashtbl.find atom_of_string s
289337
with Not_found ->
290-
let a = !next_atom in
291-
next_atom := Pos.succ !next_atom;
338+
let a =
339+
if !use_canonical_atoms then pos_of_string s else fresh_atom () in
292340
Hashtbl.add atom_of_string s a;
293341
Hashtbl.add string_of_atom a s;
294342
a
343+
295344
let extern_atom a =
296345
try
297346
Hashtbl.find string_of_atom a
298347
with Not_found ->
299348
Printf.sprintf "$%d" (P.to_int a)
300349

301-
let first_unused_ident () = !next_atom
350+
(* Ignoring the terminating "1" bit, canonical encodings of strings can
351+
be viewed as lists of bits, formed by concatenation of 6-bit fragments
352+
(for letters, numbers, and underscore) and 14-bit fragments (for other
353+
characters). Hence, not all positive numbers are canonical encodings:
354+
only those whose log2 is of the form [6n + 14m].
355+
356+
Here are the first intervals of positive numbers corresponding to strings:
357+
- [1, 1] for the empty string
358+
- [2^6, 2^7-1] for one "compact" character
359+
- [2^12, 2^13-1] for two "compact" characters
360+
- [2^14, 2^14-1] for one "escaped" character
361+
362+
Hence, between 2^7 and 2^12 - 1, we have 3968 consecutive positive
363+
numbers that cannot be the encoding of a string. These are the positive
364+
numbers we'll use as temporaries in the SimplExpr pass if canonical
365+
atoms are in use.
366+
367+
If short atoms are used, we just number the temporaries consecutively
368+
starting one above the last generated atom.
369+
*)
370+
371+
let first_unused_ident () =
372+
if !use_canonical_atoms
373+
then P.of_int 128
374+
else !next_atom
302375

303376
(* Strings *)
304377

0 commit comments

Comments
 (0)