Skip to content

Implement memory-mapped IO and multi-threading for BLAKE3 hashing #12676

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions packaging/dependencies.nix
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ scope: {
"--with-container"
"--with-context"
"--with-coroutine"
"--with-iostreams"
];
}).overrideAttrs
(old: {
Expand Down
19 changes: 17 additions & 2 deletions src/libutil/file-system.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#include <sys/time.h>
#include <unistd.h>

#include <boost/iostreams/device/mapped_file.hpp>

#ifdef _WIN32
# include <io.h>
#endif
Expand Down Expand Up @@ -273,9 +275,22 @@ std::string readFile(const std::filesystem::path & path)
return readFile(os_string_to_string(PathViewNG { path }));
}


void readFile(const Path & path, Sink & sink)
void readFile(const Path & path, Sink & sink, bool memory_map)
{
// Memory-map the file for faster processing where possible.
if (memory_map) {
try {
boost::iostreams::mapped_file_source mmap(path);
if (mmap.is_open()) {
sink({mmap.data(), mmap.size()});
return;
}
} catch (const boost::exception & e) {
}
debug("memory-mapping failed for path: %s", path);
}

// Stream the file instead if memory-mapping fails or is disabled.
AutoCloseFD fd = toDescriptor(open(path.c_str(), O_RDONLY
// TODO
#ifndef _WIN32
Expand Down
22 changes: 21 additions & 1 deletion src/libutil/hash.cc
Original file line number Diff line number Diff line change
Expand Up @@ -307,11 +307,31 @@ static void start(HashAlgorithm ha, Ctx & ctx)
else if (ha == HashAlgorithm::SHA512) SHA512_Init(&ctx.sha512);
}

// Input size, in bytes, from which multi-threaded BLAKE3 hashing via TBB is expected to beat the plain update.
//
// NOTE: The value follows the rule-of-thumb recommended by the official BLAKE3 documentation for typical x86_64
// hardware as of 2025. Exposing it as a nix.conf setting may be worthwhile in the future.
constexpr size_t blake3TbbThreshold = 128'000;

// Feed `data` into the BLAKE3 hasher, picking the update routine heuristically. For now the only criterion is the
// input size; later this could also account for available system resources, or for a shared-memory capable GPU
// backing a heterogeneous compute implementation.
void blake3_hasher_update_with_heuristics(blake3_hasher * blake3, std::string_view data)
{
    const auto * const bytes = data.data();
    const auto length = data.size();

#ifdef BLAKE3_USE_TBB
    // Inputs at or above the threshold take the TBB-backed update, which is only available when compiled in.
    if (length >= blake3TbbThreshold) {
        blake3_hasher_update_tbb(blake3, bytes, length);
        return;
    }
#endif

    blake3_hasher_update(blake3, bytes, length);
}

static void update(HashAlgorithm ha, Ctx & ctx,
std::string_view data)
{
if (ha == HashAlgorithm::BLAKE3) blake3_hasher_update(&ctx.blake3, data.data(), data.size());
if (ha == HashAlgorithm::BLAKE3) blake3_hasher_update_with_heuristics(&ctx.blake3, data);
else if (ha == HashAlgorithm::MD5) MD5_Update(&ctx.md5, data.data(), data.size());
else if (ha == HashAlgorithm::SHA1) SHA1_Update(&ctx.sha1, data.data(), data.size());
else if (ha == HashAlgorithm::SHA256) SHA256_Update(&ctx.sha256, data.data(), data.size());
Expand Down
2 changes: 1 addition & 1 deletion src/libutil/include/nix/util/file-system.hh
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ Descriptor openDirectory(const std::filesystem::path & path);
*/
std::string readFile(const Path & path);
std::string readFile(const std::filesystem::path & path);
void readFile(const Path & path, Sink & sink);
void readFile(const Path & path, Sink & sink, bool memory_map = true);

/**
* Write a string to a file.
Expand Down
5 changes: 3 additions & 2 deletions src/libutil/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,14 @@ endif

blake3 = dependency(
'libblake3',
version: '>= 1.5.5',
version: '>= 1.8.2',
method : 'pkg-config',
)
deps_private += blake3

boost = dependency(
'boost',
modules : ['context', 'coroutine'],
modules : ['context', 'coroutine', 'iostreams'],
include_type: 'system',
)
# boost is a public dependency, but not a pkg-config dependency unfortunately, so we
Expand Down
Loading