
Commit c5273f9

Complete exercise 19.3
Parent: 78df1f7

1 file changed: +38 additions, -21 deletions


chapter_19/exercise_3/plagiarism_detector (+38, -21)
@@ -30,9 +30,21 @@ main([DirectoryName]) ->
     Records = ets:tab2list(HashesTable),
 
     % Find duplicate hashes
-    Duplicates = find_duplicate_hashes(Records),
-    io:format("length: ~p~n", [length(Records)]),
-    io:format("Duplicates: ~p~n", [Duplicates]),
+    [Record|Records2] = Records,
+    Duplicates = find_duplicate_hashes(Record, Records2, []),
+
+    % Convert file IDs back to filenames
+    IdsToFilenames = fun({Hash, Locations}) ->
+        MappedLocations = lists:map(fun({FileID, Offset}) ->
+            [{_, Filename}] = ets:lookup(FilenameTable, FileID),
+            {Filename, Offset}
+        end, Locations),
+        {Hash, MappedLocations}
+    end,
+    PrintableDuplicates = lists:map(IdsToFilenames, Duplicates),
+
+    % Print the results
+    io:format("Duplicates: ~p~n", [PrintableDuplicates]),
     ok;
 
 main(_) ->
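The IdsToFilenames fun added in this hunk relies on ets:lookup/2 returning a list of {Key, Value} tuples for the queried key, which is why it pattern-matches [{_, Filename}]. A minimal sketch of that lookup, using a hypothetical set table with made-up file IDs and names (not part of this commit):

```erlang
-module(lookup_sketch).
-export([demo/0]).

demo() ->
    %% Hypothetical table contents; the real code fills FilenameTable while
    %% scanning the directory.
    FilenameTable = ets:new(filenames, [set]),
    true = ets:insert(FilenameTable, {1, "a.txt"}),
    true = ets:insert(FilenameTable, {2, "b.txt"}),
    %% ets:lookup/2 returns a list of matching {Key, Value} tuples,
    %% hence the [{_, Filename}] match in the diff above.
    [{1, "a.txt"}] = ets:lookup(FilenameTable, 1),
    Locations = [{1, 0}, {2, 3}],
    [{"a.txt", 0}, {"b.txt", 3}] =
        lists:map(fun({FileID, Offset}) ->
                      [{_, Filename}] = ets:lookup(FilenameTable, FileID),
                      {Filename, Offset}
                  end, Locations),
    true = ets:delete(FilenameTable),
    ok.
```

With a set table each key maps to at most one tuple, so the single-element match is safe as long as every FileID was inserted exactly once.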
@@ -41,34 +53,35 @@ main(_) ->
     halt(1).
 
 process_file({FileID, Filename}, HashesTable) ->
-    compute_hashes(Filename, FileID, 0, HashesTable).
+    compute_hashes(Filename, FileID, HashesTable).
 
-compute_hashes(Filename, FileID, Offset, HashesTable) ->
+compute_hashes(Filename, FileID, HashesTable) ->
     case file:read_file(Filename) of
         {ok, Data} ->
             Hashes = rolling_hash(Data),
             lists:foldl(fun(Hash, Index) ->
-                ets:insert(HashesTable, {Hash, {FileID, Index}}),
+                true = ets:insert(HashesTable, {Hash, {FileID, Index}}),
                 Index + 1
-            end, 0, Hashes);
+            end, 0, Hashes),
+            ok;
         eof ->
             ok
     end.
 
-find_duplicate_hashes(Records) ->
-    lists:foldl(fun({Hash, {FileID, Offset}}, Acc) ->
-        case lists:filter(fun({SecondHash, {SecondFileID, _}}) ->
-            (Hash =:= SecondHash) and (FileID =/= SecondFileID)
-        end, Records) of
-            [] ->
-                Acc;
-            Values ->
-                Processed = lists:map(fun({Hash, {FileID, Offset}}) ->
-                    {FileID, Offset}
-                end, Values),
-                [{Hash, [{FileID, Offset}|Processed]}|Acc]
-        end
-    end, [], Records).
+find_duplicate_hashes(_Record, [], Acc) -> Acc;
+find_duplicate_hashes({Hash, {FileID, Offset} = Location}, [NewRecord|NewRecords] = Records, Acc) ->
+    DuplicateFinder = fun({SecondHash, {SecondFileID, SecondFileOffset} = SecondLocation}) ->
+        (Hash =:= SecondHash) and (Location =/= SecondLocation)
+    end,
+    case lists:filter(DuplicateFinder, Records) of
+        [] ->
+            find_duplicate_hashes(NewRecord, NewRecords, Acc);
+        Values ->
+            Processed = lists:map(fun({Hash, {FileID, Offset}}) ->
+                {FileID, Offset}
+            end, Values),
+            find_duplicate_hashes(NewRecord, NewRecords, [{Hash, [{FileID, Offset}|Processed]}|Acc])
+    end.
 
 % Probably not the best way to implement a rolling hash, but I was short on time
 rolling_hash(Data) ->
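The rewritten find_duplicate_hashes/3 recurses through the records and, for each one, filters the remaining records for the same hash at a different location, so the work grows quadratically with the number of hashes. As a hedged alternative sketch (not the commit's code; the module and function names are made up), a similar grouping could be computed in a single pass with a map keyed by hash:

```erlang
-module(group_sketch).
-export([duplicate_hashes/1]).

%% Records is a list of {Hash, {FileID, Offset}} tuples, as produced by
%% ets:tab2list(HashesTable) above. Returns [{Hash, Locations}] for every
%% hash that occurs at more than one location.
duplicate_hashes(Records) ->
    Grouped = lists:foldl(fun({Hash, Location}, Acc) ->
                  maps:update_with(Hash, fun(Locs) -> [Location|Locs] end,
                                   [Location], Acc)
              end, #{}, Records),
    [{Hash, Locations}
     || {Hash, Locations} <- maps:to_list(Grouped), length(Locations) > 1].
```

For instance, duplicate_hashes([{h1, {1,0}}, {h1, {2,5}}, {h2, {1,3}}]) returns [{h1, [{2,5}, {1,0}]}], which matches the {Hash, Locations} shape the IdsToFilenames fun in the first hunk expects.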
@@ -88,6 +101,10 @@ compute_next_hash(LastHash, <<LastByte:1/binary, Chunk/binary>>, <<NextByte:1/bi
     Hash = hash_string(LastByte, LastHash, NextByte),
     [Hash|compute_next_hash(Hash, <<Chunk/binary, NextByte/binary>>, NewRest)].
 
+% Below are the functions for a very primitive implementation of the Rabin-Karp
+% string search algorithm. This implementation tends to return a lot of false
+% matches.
+
 % String to integer hash
 hash_string(String) ->
     Integers = [Byte || <<Byte>> <= String],
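The new comment describes these functions as a very primitive Rabin-Karp implementation. For comparison, the sketch below shows the classic Rabin-Karp polynomial rolling hash, where hash(S) = (S[0]*B^(L-1) + S[1]*B^(L-2) + ... + S[L-1]) mod M and each window's hash is derived from the previous one in constant time. Everything in it (module name, base B = 257, modulus M = 1000003) is an illustrative assumption, not code from this repository:

```erlang
-module(rabin_karp_sketch).
-export([rolling_hashes/2]).

-define(BASE, 257).
-define(MOD, 1000003).

%% Hash every Length-byte window of Data, left to right, reusing the previous
%% window's hash at each step.
rolling_hashes(Data, Length) when byte_size(Data) >= Length ->
    <<First:Length/binary, Rest/binary>> = Data,
    H0 = initial_hash(First),
    Pow = pow_mod(?BASE, Length - 1, ?MOD),
    roll(H0, First, Rest, Pow, [H0]).

%% Plain polynomial hash of the first window.
initial_hash(Bin) ->
    lists:foldl(fun(Byte, Acc) -> (Acc * ?BASE + Byte) rem ?MOD end,
                0, binary_to_list(Bin)).

roll(_Hash, _Window, <<>>, _Pow, Acc) ->
    lists:reverse(Acc);
roll(Hash, <<Out, WindowRest/binary>>, <<In, Rest/binary>>, Pow, Acc) ->
    %% Remove the outgoing byte's contribution, shift, and append the new byte.
    Removed = ((Hash - Out * Pow) rem ?MOD + ?MOD) rem ?MOD,
    Next = (Removed * ?BASE + In) rem ?MOD,
    roll(Next, <<WindowRest/binary, In>>, Rest, Pow, [Next | Acc]).

%% Naive modular exponentiation; fine for small window lengths.
pow_mod(_Base, 0, _Mod) -> 1;
pow_mod(Base, Exp, Mod) -> (Base * pow_mod(Base, Exp - 1, Mod)) rem Mod.
```

For example, rabin_karp_sketch:rolling_hashes(<<"abcde">>, 3) yields one hash per 3-byte window (<<"abc">>, <<"bcd">>, <<"cde">>). Equal windows always hash equally, while unequal windows can still collide, which is the source of the false matches the comment mentions.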
