@@ -30,9 +30,21 @@ main([DirectoryName]) ->
    Records = ets:tab2list(HashesTable),

    % Find duplicate hashes
-    Duplicates = find_duplicate_hashes(Records),
-    io:format("length: ~p~n", [length(Records)]),
-    io:format("Duplicates: ~p~n", [Duplicates]),
+    [Record|Records2] = Records,
+    Duplicates = find_duplicate_hashes(Record, Records2, []),
+
+    % Convert file IDs back to filenames
+    IdsToFilenames = fun({Hash, Locations}) ->
+        MappedLocations = lists:map(fun({FileID, Offset}) ->
+            [{_, Filename}] = ets:lookup(FilenameTable, FileID),
+            {Filename, Offset}
+        end, Locations),
+        {Hash, MappedLocations}
+    end,
+    PrintableDuplicates = lists:map(IdsToFilenames, Duplicates),
+
+    % Print the results
+    io:format("Duplicates: ~p~n", [PrintableDuplicates]),
    ok;

main(_) ->
@@ -41,34 +53,35 @@ main(_) ->
    halt(1).

process_file({FileID, Filename}, HashesTable) ->
-    compute_hashes(Filename, FileID, 0, HashesTable).
+    compute_hashes(Filename, FileID, HashesTable).

-compute_hashes(Filename, FileID, Offset, HashesTable) ->
+compute_hashes(Filename, FileID, HashesTable) ->
    case file:read_file(Filename) of
        {ok, Data} ->
            Hashes = rolling_hash(Data),
            lists:foldl(fun(Hash, Index) ->
-                ets:insert(HashesTable, {Hash, {FileID, Index}}),
+                true = ets:insert(HashesTable, {Hash, {FileID, Index}}),
                Index + 1
-            end, 0, Hashes);
+            end, 0, Hashes),
+            ok;
        eof ->
            ok
    end.

-find_duplicate_hashes(Records) ->
-    lists:foldl(fun({Hash, {FileID, Offset}}, Acc) ->
-        case lists:filter(fun({SecondHash, {SecondFileID, _}}) ->
-            (Hash =:= SecondHash) and (FileID =/= SecondFileID)
-        end, Records) of
-            [] ->
-                Acc;
-            Values ->
-                Processed = lists:map(fun({Hash, {FileID, Offset}}) ->
-                    {FileID, Offset}
-                end, Values),
-                [{Hash, [{FileID, Offset}|Processed]}|Acc]
-        end
-    end, [], Records).
+find_duplicate_hashes(_Record, [], Acc) -> Acc;
+find_duplicate_hashes({Hash, {FileID, Offset} = Location}, [NewRecord|NewRecords] = Records, Acc) ->
+    DuplicateFinder = fun({SecondHash, {SecondFileID, SecondFileOffset} = SecondLocation}) ->
+        (Hash =:= SecondHash) and (Location =/= SecondLocation)
+    end,
+    case lists:filter(DuplicateFinder, Records) of
+        [] ->
+            find_duplicate_hashes(NewRecord, NewRecords, Acc);
+        Values ->
+            Processed = lists:map(fun({Hash, {FileID, Offset}}) ->
+                {FileID, Offset}
+            end, Values),
+            find_duplicate_hashes(NewRecord, NewRecords, [{Hash, [{FileID, Offset}|Processed]}|Acc])
+    end.

% Probably not the best way to implement a rolling hash, but I was short on time
rolling_hash(Data) ->
@@ -88,6 +101,10 @@ compute_next_hash(LastHash, <<LastByte:1/binary, Chunk/binary>>, <<NextByte:1/bi
    Hash = hash_string(LastByte, LastHash, NextByte),
    [Hash|compute_next_hash(Hash, <<Chunk/binary, NextByte/binary>>, NewRest)].

+% Below are the functions for a very primitive implementation of the Rabin-Karp
+% string search algorithm. This implementation tends to return a lot of false
+% matches.
+
% String to integer hash
hash_string(String) ->
    Integers = [Byte || <<Byte>> <= String],
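The comment added in this diff describes the rolling hash as a very primitive Rabin-Karp implementation that tends to return false matches. For context, here is a minimal, self-contained sketch of a Rabin-Karp-style polynomial rolling hash in Erlang. It is illustrative only and is not the code from this repository; the module name rk_sketch, the base 257, and the modulus 2^61 - 1 are assumptions chosen for the example. Because different windows can collide under a fixed modulus, any candidate match found this way still has to be verified byte-for-byte, which is where false matches come from.

% Illustrative sketch only; not the code from this commit.
-module(rk_sketch).
-export([rk_hash/1, rk_pow/1, rk_roll/4]).

-define(BASE, 257).                 % assumed base
-define(MOD, ((1 bsl 61) - 1)).     % assumed modulus; collisions remain possible

% Hash an initial window of bytes.
rk_hash(Window) when is_binary(Window) ->
    lists:foldl(fun(Byte, Acc) ->
        (Acc * ?BASE + Byte) rem ?MOD
    end, 0, binary_to_list(Window)).

% Precompute ?BASE^(WindowSize - 1) rem ?MOD, needed to drop the leftmost byte.
rk_pow(WindowSize) when WindowSize >= 1 ->
    lists:foldl(fun(_, Acc) ->
        Acc * ?BASE rem ?MOD
    end, 1, lists:seq(2, WindowSize)).

% Slide the window one byte to the right: remove OldByte, append NewByte.
rk_roll(Hash, OldByte, NewByte, Pow) ->
    Dropped = ((Hash - OldByte * Pow) rem ?MOD + ?MOD) rem ?MOD,
    (Dropped * ?BASE + NewByte) rem ?MOD.

With this, sliding the window costs O(1) per step instead of rehashing the whole window; for example, rk_roll(rk_hash(<<"abc">>), $a, $d, rk_pow(3)) equals rk_hash(<<"bcd">>).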