Skip to content

Commit ab8a684

Browse files
author
Mark Hildebrand
authored
Merge pull request #13 from IntelLabs/mh/v0.0.2
Upstream from internal repo.
2 parents 568cfe7 + d76175b commit ab8a684

File tree

159 files changed

+10948
-4832
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

159 files changed

+10948
-4832
lines changed

CMakeLists.txt

+11-6
Original file line numberDiff line numberDiff line change
@@ -48,19 +48,15 @@ target_compile_options(
4848
#####
4949

5050
include("cmake/options.cmake")
51+
5152
include("cmake/clang-tidy.cmake")
53+
include("cmake/eve.cmake")
5254
include("cmake/pthread.cmake")
5355
include("cmake/numa.cmake")
5456
include("cmake/robin-map.cmake")
5557
include("cmake/fmt.cmake")
5658
include("cmake/toml.cmake")
5759

58-
# TODO: Right now this is always needed.
59-
# Decoupling our dependence on EVE to disable quantization will be ... tricky ...
60-
if(SVS_ENABLE_QUANTIZATION)
61-
include("cmake/eve.cmake")
62-
endif()
63-
6460
#####
6561
##### Build Objects
6662
#####
@@ -81,6 +77,15 @@ if(SVS_BUILD_DOCS)
8177
add_subdirectory(docs)
8278
endif()
8379

80+
# The benchmark directory contains a sub-component that is used by both the benchmarking
81+
# framework and the unit-tests.
82+
#
83+
# If only the unit tests are enabled, then the benchmark will be built as a minimal
84+
# component to avoid excessive compilation time.
85+
if(SVS_BUILD_BENCHMARK OR SVS_BUILD_TESTS)
86+
add_subdirectory(benchmark)
87+
endif()
88+
8489
#####
8590
##### Install Logic
8691
#####

README.md

+7-7
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,12 @@ different configurations of SVS yield significantly increased performance (measu
2828
SVS is primarily optimized for large-scale similarity search but it still offers [state-of-the-art performance
2929
at million-scale](https://intellabs.github.io/ScalableVectorSearch/benchs/small_scale_benchs.html).
3030

31-
Best performance is obtained with 4th generation (Sapphire Rapids) by making use of AVX-512 instructions,
32-
with excellent results also with 2nd and 3rd Intel ® Xeon ® processors (Cascade Lake
31+
Best performance is obtained with 4th generation (Sapphire Rapids) by making use of AVX-512 instructions,
32+
with excellent results also with 2nd and 3rd Intel ® Xeon ® processors (Cascade Lake
3333
and Ice Lake).
3434

35-
Performance will be degraded if AVX-512 instructions are not available.
36-
A warning message will appear when loading the SVS Python module if the system does not support
35+
Performance will be degraded if AVX-512 instructions are not available.
36+
A warning message will appear when loading the SVS Python module if the system does not support
3737
AVX-512 instructions.
3838

3939
## Key Features
@@ -117,10 +117,10 @@ Reference to cite when you use SVS in a research paper:
117117
@article{aguerrebere2023similarity,
118118
title={Similarity search in the blink of an eye with compressed indices},
119119
volume = {16},
120-
number = {11},
121-
pages = {3433--3446},
120+
number = {11},
121+
pages = {3433--3446},
122122
journal = {Proceedings of the VLDB Endowment},
123-
author={Cecilia Aguerrebere and Ishwar Bhati and Mark Hildebrand and Mariano Tepper and Ted Willke},
123+
author={Cecilia Aguerrebere and Ishwar Bhati and Mark Hildebrand and Mariano Tepper and Ted Willke},
124124
year = {2023}
125125
}
126126
```

THIRD-PARTY-PROGRAMS

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3838
SOFTWARE.
3939

4040
--------------------------------------------------------------------------------
41-
2. fmtlib (cmake/fmt.cmake, https://github.com/fmtlib/fmt/tree/9.1.0)
41+
2. fmtlib (cmake/fmt.cmake, https://github.com/fmtlib/fmt/tree/10.1.0)
4242

4343
Copyright (c) 2012 - present, Victor Zverovich and {fmt} contributors
4444

benchmark/CMakeLists.txt

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# The svs-benchmark consists of two components:
2+
#
3+
# (1) A shared-library containing most of the implementation logic.
4+
# (2) A thin executable that provides a command-line interface to the shared-library.
5+
#
6+
# This is because svs-benchmark provides utilities that can be reused for testing.
7+
# This is done by linking the test executable to the svs-benchmark shared library.
8+
9+
# As a compile-time optimization, if we're just building the tests, we disable all
10+
# specializations compiled in the shared library.
11+
if (${SVS_BUILD_BENCHMARK})
12+
set(SVS_BENCHMARK_MINIMAL OFF)
13+
else()
14+
set(SVS_BENCHMARK_MINIMAL ON)
15+
endif()
16+
17+
set(SHARED_LIBRARY_FILES
18+
src/benchmark.cpp
19+
src/vamana/build.cpp
20+
)
21+
22+
add_library(svs_benchmark_library SHARED ${SHARED_LIBRARY_FILES})
23+
target_include_directories(svs_benchmark_library PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include)
24+
25+
if (${SVS_BENCHMARK_MINIMAL})
26+
message("Compiling minimal benchmark library")
27+
target_compile_options(svs_benchmark_library PUBLIC -DSVS_BENCHMARK_MINIMAL)
28+
endif()
29+
30+
# Link the library with the main SVS library.
31+
target_link_libraries(
32+
svs_benchmark_library
33+
PUBLIC
34+
${SVS_LIB}
35+
svs_compile_options
36+
svs_native_options
37+
fmt::fmt
38+
)
39+
40+
# Build the command-line executable and link it against the shared library.
41+
set(EXE_FILES src/main.cpp)
42+
add_executable(svs_benchmark ${EXE_FILES})
43+
target_link_libraries(svs_benchmark PRIVATE svs_benchmark_library)
+210
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
#pragma once
2+
3+
// svs
4+
#include "svs/core/data.h"
5+
#include "svs/lib/file.h"
6+
#include "svs/lib/saveload.h"
7+
#include "svs/quantization/lvq/lvq.h"
8+
#include "svs/third-party/toml.h"
9+
10+
// third-party
11+
#include "fmt/core.h"
12+
13+
// stl
14+
#include <filesystem>
15+
#include <optional>
16+
#include <span>
17+
#include <string>
18+
#include <string_view>
19+
20+
namespace svsbenchmark {
21+
22+
// Trait to determine if we're in a minimal build environment.
23+
#if defined(SVS_BENCHMARK_MINIMAL)
24+
inline constexpr bool is_minimal = true;
25+
#else
26+
inline constexpr bool is_minimal = false;
27+
#endif
28+
29+
// Serialize the TOML table to a file in a way that either succeeds in overwriting an
30+
// existing file at the path `path` or completely fails.
31+
//
32+
// Torn writes will be avoided.
33+
void atomic_save(const toml::table& table, const std::filesystem::path& path);
34+
35+
// Mutate `table` by appending `data` to an array stored at `table[key]`.
36+
// Create such an array if `table[key]` does not exist.
37+
//
38+
// Throws an `svs::ANNException` if the node at `table[key]` is not a `toml::array`.
39+
void append_or_create(toml::table& table, const toml::table& data, std::string_view key);
40+
41+
// Extract a file path from the given TOML table with an optional root to append.
42+
// Checks if the file exists or not.
43+
//
44+
// If the file does not exist, throw an ANNException with a descriptive error message.
45+
std::filesystem::path extract_filename(
46+
const toml::table& table,
47+
std::string_view key,
48+
const std::optional<std::filesystem::path>& root
49+
);
50+
51+
// Unified polymorphic type for running benchmarks.
52+
class Benchmark {
53+
public:
54+
Benchmark() = default;
55+
56+
Benchmark(const Benchmark&) = delete;
57+
Benchmark(Benchmark&&) = delete;
58+
59+
Benchmark& operator=(const Benchmark&) = delete;
60+
Benchmark& operator=(Benchmark&&) = delete;
61+
62+
std::string name() const { return do_name(); }
63+
int run(std::span<const std::string_view> args) const { return do_run(args); }
64+
65+
virtual ~Benchmark() = default;
66+
67+
protected:
68+
// Note for implementers: The name returned by `do_name()` will be used by the main
69+
// executable to dispatch to the backend benchmark. It should be unique and not
70+
// contain spaces.
71+
virtual std::string do_name() const = 0;
72+
73+
// The arguments given will be all the command-line arguments minus the first two:
74+
// Argument 0 is the executable name and not needed.
75+
// Argument 1 is used to dispatch to the appropriate backend.
76+
//
77+
// All the rest are forwarded unaltered.
78+
virtual int do_run(std::span<const std::string_view>) const = 0;
79+
};
80+
81+
// In general, index builds can take a long time and it may be beneficial to do two things:
82+
//
83+
// (1) Regularly save checkpoints of results as they are generated so that if the
84+
// application
85+
// fails, we do not lose all of our data.
86+
// (2) Provide results in as near real-time as we can so we can monitor currently running
87+
// processes to determine as early as possible if something has gone wrong.
88+
//
89+
// The Checkpoint class keeps a record of the current results generated so far, appends
90+
// new results as they become available and regularly saves results.
91+
//
92+
// It *does* involve many copies of the underlying TOML data, but I believe the tradeoff in
93+
// data safety greatly outweighs any extra time spent moving around TOML data.
94+
class Checkpoint {
95+
private:
96+
toml::table data_;
97+
std::filesystem::path filename_;
98+
99+
public:
100+
Checkpoint(const toml::table& data, const std::filesystem::path& filename)
101+
: data_{data}
102+
, filename_{filename} {}
103+
104+
void checkpoint(const toml::table& new_data, std::string_view key) const {
105+
// Make a copy of our current checkpointed data and try to append the new data
106+
// to the list.
107+
//
108+
// Make sure to handle the case where this is the first data being registered
109+
// with the given key.
110+
//
111+
// This is not the most efficient implementation because we make unnecessary copies,
112+
// but the complexity required to correctly apply incremental data does not
113+
// seem to be worth it.
114+
auto data_copy = data_;
115+
append_or_create(data_copy, new_data, key);
116+
atomic_save(data_copy, filename_);
117+
}
118+
};
119+
120+
/// Regular old uncompressed data.
121+
struct Uncompressed {
122+
// Sadly, we can't have computed constexpr string names yet :(
123+
constexpr static std::string_view name() { return "uncompressed"; }
124+
};
125+
126+
/// LVQ compression.
127+
/// Setting `Residual = 0` implies one-level LVQ.
128+
template <size_t Primary, size_t Residual = 0> struct LVQ {
129+
static std::string name() {
130+
if constexpr (Residual == 0) {
131+
return fmt::format("lvq{}", Primary);
132+
} else {
133+
return fmt::format("lvq{}x{}", Primary, Residual);
134+
}
135+
}
136+
};
137+
138+
///
139+
/// Take a collection of dispatch tag types. Requires the following of each type:
140+
///
141+
/// (1) Types are default constructible.
142+
/// (2) Types have a static `name()` method returning a `std::string` or `std::string_view`.
143+
///
144+
/// Iterates through the list of types trying to match the `name` argument to the type's
145+
/// static `name()` result. If a match is found, call the callable `f` with a default
146+
/// constructed instance of the matching type.
147+
///
148+
/// Throws ANNException if no match is found.
149+
///
150+
template <typename F, typename T, typename... Ts>
151+
auto parse_dispatch(svs::lib::meta::Types<T, Ts...>, std::string_view name, F&& f) {
152+
if (name == T::name()) {
153+
return f(T());
154+
}
155+
156+
if constexpr (sizeof...(Ts) == 0) {
157+
throw ANNEXCEPTION("No dispatch type matching name {}", name);
158+
} else {
159+
return parse_dispatch(svs::lib::meta::Types<Ts...>(), name, std::forward<F>(f));
160+
}
161+
}
162+
163+
///
164+
/// Helper types to describe "extent"
165+
///
166+
struct Extent {
167+
public:
168+
size_t value_;
169+
170+
public:
171+
explicit Extent(size_t value)
172+
: value_{value} {}
173+
operator size_t() const { return value_; }
174+
};
175+
176+
} // namespace svsbenchmark
177+
178+
namespace svs::lib {
179+
template <> struct Saver<svsbenchmark::Extent> {
180+
static SaveNode save(svsbenchmark::Extent x) {
181+
if (x.value_ == Dynamic) {
182+
return SaveNode("dynamic");
183+
} else {
184+
return SaveNode(narrow<int64_t>(x.value_));
185+
}
186+
}
187+
};
188+
189+
template <> struct Loader<svsbenchmark::Extent> {
190+
using toml_type = toml::node;
191+
static constexpr bool is_version_free = true;
192+
static svsbenchmark::Extent load(const toml_type& node) {
193+
if (auto* v = node.as<std::string>(); v != nullptr) {
194+
const std::string& str = v->get();
195+
if (str == "dynamic") {
196+
return svsbenchmark::Extent(Dynamic);
197+
}
198+
throw ANNEXCEPTION(
199+
"Unrecognized string {} when trying to load extent from {}!",
200+
str,
201+
fmt::streamed(node.source())
202+
);
203+
}
204+
205+
// Try to get as an integer and fail hard if that doesn't work.
206+
auto u = toml_helper::get_as<int64_t>(node);
207+
return svsbenchmark::Extent(u == -1 ? Dynamic : narrow<size_t>(u));
208+
}
209+
};
210+
} // namespace svs::lib

0 commit comments

Comments
 (0)