Skip to content

Commit 233968b

Browse files
committed
Patched DDUP to use XXHASH3
revert strcpy, fix library linking; properly print hash digest strings; fix segfault on crash; housekeeping. Signed-off-by: Andrew Robbins [email protected]
1 parent 4b695cd commit 233968b

File tree

3 files changed

+36
-37
lines changed

3 files changed

+36
-37
lines changed

CMakeLists.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,9 @@ IF(LibCap_FOUND)
147147
LIST(APPEND MFU_EXTERNAL_LIBS ${LibCap_LIBRARIES})
148148
ENDIF(LibCap_FOUND)
149149

150-
## OPENSSL for ddup
151-
FIND_PACKAGE(OpenSSL)
150+
## XXHASH for ddup
151+
find_package(xxHash REQUIRED)
152+
INCLUDE_DIRECTORIES(${xxHash_INCLUDE_DIRS})
152153

153154
# Setup Installation
154155

src/ddup/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
MFU_ADD_TOOL(ddup)
2-
TARGET_LINK_LIBRARIES(ddup ${OPENSSL_LIBRARIES})
2+
TARGET_LINK_LIBRARIES(ddup xxhash)

src/ddup/ddup.c

Lines changed: 32 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,21 @@
11
#include <stdio.h>
22
#include <string.h>
33
#include <getopt.h>
4-
#include <openssl/sha.h>
54
#include <assert.h>
65
#include <inttypes.h>
6+
#include <xxh3.h>
77
#include "mpi.h"
88
#include "dtcmp.h"
99
#include "mfu.h"
1010
#include "list.h"
1111

1212
/* number of uint64_t values in our key
13-
* 1 for group ID + (SHA256_DIGEST_LENGTH / 8) */
14-
#define DDUP_KEY_SIZE 5
15-
13+
* 1 for group ID + (XXH3_DIGEST_LENGTH / 8) */
14+
#define DDUP_KEY_SIZE 2
15+
/* XXH3 64-bit digest length in bytes (the hash is one unsigned 64-bit integer) */
16+
#define XXH3_DIGEST_LENGTH 8
1617
/* amount of data to read in order to compute hash */
17-
#define DDUP_CHUNK_SIZE 1048576
18+
#define DDUP_CHUNK_SIZE 4096
1819

1920
/* Print a usage message */
2021
static void print_usage(void)
@@ -35,11 +36,11 @@ static void print_usage(void)
3536
/* create MPI datatypes for key and key and satellite data */
3637
static void mpi_type_init(MPI_Datatype* key, MPI_Datatype* keysat)
3738
{
38-
assert(SHA256_DIGEST_LENGTH == (DDUP_KEY_SIZE - 1) * 8);
39+
assert(XXH3_DIGEST_LENGTH == (DDUP_KEY_SIZE - 1) * 8);
3940

4041
/*
4142
* Build MPI datatype for key.
42-
* 1 for group ID + (SHA256_DIGEST_LENGTH / 8)
43+
* 1 for group ID + (XXH3_DIGEST_LENGTH / 8)
4344
*/
4445
MPI_Type_contiguous(DDUP_KEY_SIZE, MPI_UINT64_T, key);
4546
MPI_Type_commit(key);
@@ -141,14 +142,14 @@ static int read_data(const char* fname, char* chunk_buf, uint64_t chunk_id,
141142
}
142143

143144
/* per-file hashing state; main() allocates one of these for each file being checked */
struct file_item {
144-
SHA256_CTX ctx;
145+
XXH3_state_t state;
145146
};
146147

147-
/* print SHA256 value to stdout */
148-
static void dump_sha256_digest(char* digest_string, unsigned char digest[])
148+
/*
 * Format an XXH3 digest as a lowercase hexadecimal string.
 * Nothing is printed here; the caller prints digest_string.
 *
 *   digest_string - output buffer; receives two hex characters per digest
 *                   byte plus the terminating NUL written by sprintf, so it
 *                   must hold at least XXH3_DIGEST_LENGTH * 2 + 1 bytes
 *   digest        - the XXH3_DIGEST_LENGTH digest bytes to format
 */
149+
static void dump_xxh3_digest(char* digest_string, unsigned char digest[])
149150
{
150151
int i;
151-
for (i = 0; i < SHA256_DIGEST_LENGTH; i++) {
152+
for (i = 0; i < XXH3_DIGEST_LENGTH; i++) {
152153
/* byte i becomes the two hex digits starting at offset i * 2 */
sprintf(&digest_string[i * 2], "%02x", (unsigned int)digest[i]);
153154
}
154155
}
@@ -161,7 +162,7 @@ int main(int argc, char** argv)
161162

162163
uint64_t chunk_size = DDUP_CHUNK_SIZE;
163164

164-
SHA256_CTX* ctx_ptr;
165+
XXH3_state_t* state_ptr;
165166

166167
MPI_Init(NULL, NULL);
167168
mfu_init();
@@ -310,8 +311,8 @@ int main(int argc, char** argv)
310311
/* get local number of items in flist */
311312
uint64_t checking_files = mfu_flist_size(flist);
312313

313-
/* allocate memory to hold SHA256 context values */
314-
struct file_item* file_items = (struct file_item*) MFU_MALLOC(checking_files * sizeof(*file_items));
314+
/* allocate memory to hold XXH3 context values */
315+
struct file_item* file_items = (struct file_item*) XXH_alignedMalloc(checking_files * sizeof(*file_items), 128);
315316

316317
/* Allocate two lists of length size, where each
317318
* element has (DDUP_KEY_SIZE + 1) uint64_t values
@@ -346,8 +347,9 @@ int main(int argc, char** argv)
346347
/* record our index in flist */
347348
ptr[DDUP_KEY_SIZE] = i;
348349

349-
/* initialize the SHA256 hash state for this file */
350-
SHA256_Init(&file_items[i].ctx);
350+
/* initialize the XXH3 hash state for this file */
351+
XXH3_INITSTATE(&file_items[i].state);
352+
XXH3_64bits_reset(&file_items[i].state);
351353

352354
/* increment our file count */
353355
new_checking_files++;
@@ -376,7 +378,7 @@ int main(int argc, char** argv)
376378
/* update the chunk id we'll read from all files */
377379
chunk_id++;
378380

379-
/* iterate over our list and compute SHA256 value for each */
381+
/* iterate over our list and compute XXH3 value for each */
380382
ptr = list;
381383
for (i = 0; i < checking_files; i++) {
382384
/* get the flist index for this item */
@@ -399,18 +401,14 @@ int main(int argc, char** argv)
399401
"process", fname);
400402
}
401403

402-
/* update the SHA256 context for this file */
403-
ctx_ptr = &file_items[idx].ctx;
404-
SHA256_Update(ctx_ptr, chunk_buf, data_size);
404+
/* update the XXH3 context for this file */
405+
state_ptr = &file_items[idx].state;
406+
XXH3_64bits_update(state_ptr, chunk_buf, data_size);
405407

406408
/*
407-
* Use SHA256 value as key.
408-
* This is actually an hack, but SHA256_Final can't
409-
* be called multiple times with out changing ctx
409+
* Use XXH3 digest as key.
410410
*/
411-
SHA256_CTX ctx_tmp;
412-
memcpy(&ctx_tmp, ctx_ptr, sizeof(ctx_tmp));
413-
SHA256_Final((unsigned char*)(ptr + 1), &ctx_tmp);
411+
XXH64_hash_t result = XXH3_64bits_digest(state_ptr);
414412

415413
/* move on to next file in the list */
416414
ptr += DDUP_KEY_SIZE + 1;
@@ -441,8 +439,8 @@ int main(int argc, char** argv)
441439
/* look up file size */
442440
file_size = mfu_flist_file_get_size(flist, idx);
443441

444-
/* get a pointer to the SHA256 context for this file */
445-
ctx_ptr = &file_items[idx].ctx;
442+
/* get a pointer to the XXH3 context for this file */
443+
state_ptr = &file_items[idx].state;
446444

447445
if (group_ranks[i] == 1) {
448446
/*
@@ -457,11 +455,11 @@ int main(int argc, char** argv)
457455
* duplicate with other files that also have
458456
* matching group_id[i]
459457
*/
460-
unsigned char digest[SHA256_DIGEST_LENGTH];
461-
SHA256_Final(digest, ctx_ptr);
462-
463-
char digest_string[SHA256_DIGEST_LENGTH * 2 + 1];
464-
dump_sha256_digest(digest_string, digest);
458+
XXH64_hash_t digest = XXH3_64bits_digest(state_ptr);
459+
XXH64_canonical_t digest_canon;
460+
XXH64_canonicalFromHash(&digest_canon, digest);
461+
char digest_string[XXH3_DIGEST_LENGTH];
462+
dump_xxh3_digest(digest_string, digest_canon.digest);
465463
printf("%s %s\n", fname, digest_string);
466464
} else {
467465
/* Have multiple files with the same checksum,
@@ -519,7 +517,7 @@ int main(int argc, char** argv)
519517
mfu_free(&group_id);
520518
mfu_free(&new_list);
521519
mfu_free(&list);
522-
mfu_free(&file_items);
520+
XXH_alignedFree(&file_items);
523521
mfu_free(&chunk_buf);
524522
mfu_flist_free(&flist);
525523

0 commit comments

Comments
 (0)