-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathParallelMergeSortBenchmark.cpp
284 lines (249 loc) · 14.2 KB
/
ParallelMergeSortBenchmark.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
//#include <oneapi/dpl/execution>
//#include <oneapi/dpl/algorithm>
#include <iostream>
#include <algorithm>
#include <chrono>
#include <random>
#include <ratio>
#include <vector>
#include <execution>
#include "ParallelMergeSort.h"
#include "SortParallel.h"
using std::chrono::duration;
using std::chrono::duration_cast;
using std::chrono::high_resolution_clock;
using std::milli;
using std::random_device;
using std::sort;
using std::vector;
const int iterationCount = 5;
static void print_results(const char *const tag, const double * sorted, size_t sortedLength,
high_resolution_clock::time_point startTime,
high_resolution_clock::time_point endTime) {
printf("%s: Lowest: %g Highest: %g Time: %fms\n", tag,
sorted[0], sorted[sortedLength - 1],
duration_cast<duration<double, milli>>(endTime - startTime).count());
}
static void print_results(const char* const tag, const unsigned* sorted, size_t sortedLength,
high_resolution_clock::time_point startTime,
high_resolution_clock::time_point endTime) {
printf("%s: Lowest: %u Highest: %u Time: %fms\n", tag,
sorted[0], sorted[sortedLength - 1],
duration_cast<duration<double, milli>>(endTime - startTime).count());
}
int ParallelMergeSortBenchmark(vector<double>& doubles)
{
// generate some random uints:
printf("\nBenchmarking Parallel Merge Sort Hybrid with %zu doubles...\n", doubles.size());
double* doublesCopy = new double[doubles.size()];
double* doublesCopy2 = new double[doubles.size()];
double* sorted = new double[doubles.size()];
vector<double> doublesCopyVec(doubles);
// time how long it takes to sort them:
for (int i = 0; i < iterationCount; ++i)
{
for (size_t j = 0; j < doubles.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting
doublesCopy[ j] = doubles[j];
doublesCopy2[ j] = doubles[j];
doublesCopyVec[j] = doubles[j];
sorted[ j] = (double)j; // page in the destination array into system memory
}
const auto startTime = high_resolution_clock::now();
ParallelAlgorithms::sort_par(doublesCopy, doubles.size(), sorted, doubles.size(), false); // not-in-place interface
//ParallelAlgorithms::sort_par(doublesCopy, doubles.size()); // in-place adaptive interface
//ParallelAlgorithms::sort_par(doublesCopyVec); // in-place adaptive interface (vector)
const auto endTime = high_resolution_clock::now();
//printf("ParallelAlgorithms sorting is done\n");
sort(std::execution::par_unseq, doublesCopy2, doublesCopy2 + doubles.size());
//sort(std::execution::par_unseq, ulongsCopyVec2.begin(), ulongsCopyVec2.end());
//if (std::equal(sorted, sorted + uints.size(), uintsCopy2))
if (!std::equal(doublesCopy, doublesCopy + doubles.size(), doublesCopy2))
{
std::cout << "Arrays are not equal ";
exit(1);
}
print_results("Parallel Merge Sort", sorted, doubles.size(), startTime, endTime);
}
delete[] sorted;
delete[] doublesCopy;
return 0;
}
int ParallelMergeSortBenchmark(vector<unsigned>& uints, const size_t& testSize)
{
// generate some random uints:
//printf("\nBenchmarking Parallel Merge Sort Hybrid with %zu unsigned longs (each of %lu bytes)...\n", uints.size(), (unsigned long)sizeof(unsigned long));
//const size_t testSize = 1'000'000'000;
unsigned* uintsCopy = new unsigned[testSize];
unsigned* uintsCopy2 = new unsigned[testSize];
unsigned* sorted = new unsigned[testSize];
//printf("Allocations of arrays succeeded using new\n");
//vector<unsigned long> ulongsCopyVec(uints);
//vector<unsigned long> ulongsCopyVec2(uints);
//printf("Allocations of arrays succeeded using vector\n");
// time how long it takes to sort them:
for (int i = 0; i < iterationCount; ++i)
{
for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting
uintsCopy[ j] = uints[j];
uintsCopy2[j] = uints[j];
sorted[j] = (unsigned long)j; // page in the destination array into system memory
//ulongsCopyVec[ j] = uints[j];
//ulongsCopyVec2[j] = uints[j];
}
const auto startTime = high_resolution_clock::now();
// Example of usages, which trade off ease of use and performance
//ParallelAlgorithms::sort_par(uintsCopy, uints.size()); // in-place adaptive interface
//ParallelAlgorithms::sort_par(uintsCopy, 0, uints.size()); // in-place adaptive interface
//ParallelAlgorithms::sort_par(uintsCopy, uints.size(), sorted, uints.size(), false); // in-place interface
//ParallelAlgorithms::sort_par(uintsCopy, uints.size(), sorted, uints.size(), true); // not in-place interface
//ParallelAlgorithms::sort_par(uintsCopy, 0, uints.size(), sorted, uints.size(), false); // in-place interface
//ParallelAlgorithms::sort_par(ulongsCopyVec); // in-place adaptive interface (vector)
//sort(ulongsCopyVec.begin(), ulongsCopyVec.end()); // in-place adaptive interface (vector)
//ParallelAlgorithms::merge_sort(uintsCopy, 0, uints.size() - 1, sorted, false);
//ParallelAlgorithms::merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, sorted, false);
//ParallelAlgorithms::parallel_merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, sorted, false);
//ParallelAlgorithms::parallel_merge_sort_hybrid_rh(uintsCopy, 0, uints.size() - 1, sorted, false);
//ParallelAlgorithms::parallel_merge_sort_hybrid_rh_1(uintsCopy, 0, uints.size() - 1, sorted, false);
//ParallelAlgorithms::parallel_merge_merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, sorted, false, uints.size() / 8);
//ParallelAlgorithms::parallel_merge_merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, sorted, false);
//ParallelAlgorithms::parallel_merge_sort_hybrid_radix(uintsCopy, 0, (int)(uints.size() - 1), sorted, false, uints.size() / 8); // ParallelMergeSort modifies the source array (using 8-cores get highest performance on 48-core CPU C5i)
ParallelAlgorithms::parallel_merge_sort_hybrid_radix(uintsCopy, 0, uints.size() - 1, sorted, false);
//ParallelAlgorithms::parallel_inplace_merge_sort_radix_hybrid(uintsCopy, 0, uints.size() - 1, uints.size() / 4); // using 4 cores best performance on 6-core AWS node
//ParallelAlgorithms::parallel_inplace_merge_sort_radix_hybrid(uintsCopy, 0, uints.size() - 1, uints.size() / 18); // using 18 cores best performance on C5.24xlarge 48-core AWS node
//RadixSortLSDPowerOf2Radix_unsigned_TwoPhase(uintsCopy, sorted, uints.size());
//RadixSortLSDPowerOf2Radix_unsigned_TwoPhase_DeRandomize(uintsCopy, sorted, uints.size())
const auto endTime = high_resolution_clock::now();
//printf("ParallelAlgorithms sorting is done\n");
sort(std::execution::par_unseq, uintsCopy2, uintsCopy2 + uints.size());
//sort(std::execution::par_unseq, ulongsCopyVec2.begin(), ulongsCopyVec2.end());
//if (std::equal(sorted, sorted + uints.size(), uintsCopy2))
if (!std::equal(uintsCopy, uintsCopy + uints.size(), uintsCopy2))
{
std::cout << "Arrays are not equal ";
exit(1);
}
print_results("Parallel Merge Sort", sorted, uints.size(), startTime, endTime);
}
delete[] sorted;
delete[] uintsCopy2;
delete[] uintsCopy;
return 0;
}
int ParallelInPlaceMergeSortBenchmark(vector<unsigned>& uints)
{
// generate some random uints:
printf("\nBenchmarking InPlace Parallel Merge Sort Hybrid with %zu unsigned (each of %u bytes)...\n", uints.size(), (unsigned)sizeof(unsigned));
unsigned* uintsCopy = new unsigned[uints.size()];
unsigned* uintsCopy2 = new unsigned[uints.size()];
unsigned* sorted = new unsigned[uints.size()];
// time how long it takes to sort them:
for (int i = 0; i < iterationCount; ++i)
{
for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting
uintsCopy2[j] = uints[j];
uintsCopy[ j] = uints[j];
sorted[ j] = (unsigned)j; // page in the destination array into system memory
}
const auto startTime = high_resolution_clock::now();
//ParallelAlgorithms::parallel_merge_sort_hybrid_rh_1(uintsCopy, 0, uints.size() - 1, sorted); // ParallelMergeSort modifies the source array
//ParallelAlgorithms::merge_sort_bottom_up_inplace(uintsCopy, 0, uints.size());
//ParallelAlgorithms::merge_sort_bottom_up_inplace_hybrid(uintsCopy, 0, uints.size());
//ParallelAlgorithms::merge_sort_inplace(uintsCopy, 0, uints.size() - 1);
ParallelAlgorithms::merge_sort_inplace_hybrid_with_insertion(uintsCopy, 0, uints.size() - 1);
//ParallelAlgorithms::merge_sort_inplace_hybrid_with_sort(uintsCopy, 0, uints.size() - 1, false);
//std::cout << "Before parallel inplace merge sort" << std::endl;
//parallel_inplace_merge_sort_hybrid_inner(uintsCopy2, 0, (int)(uints.size() - 1));
//ParallelAlgorithms::parallel_inplace_merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, true, uints.size() / 4);
//ParallelAlgorithms::parallel_inplace_merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, false, uints.size() / 48);
//ParallelAlgorithms::preventative_adaptive_inplace_merge_sort(uintsCopy, 0, uints.size() - 1, 0.75);
//ParallelAlgorithms::parallel_preventative_adaptive_inplace_merge_sort(uintsCopy, 0, uints.size() - 1, 0.75);
//ParallelAlgorithms::parallel_preventative_adaptive_inplace_merge_sort(uintsCopy, 0, uints.size() - 1, false, 0.01, uints.size() / 48); // threshold 48 or 32 * 1024
//ParallelAlgorithms::parallel_preventative_adaptive_inplace_merge_sort_2(uintsCopy, 0, uints.size() - 1, 0.9, uints.size() / 24); // threshold 48 or 32 * 1024
//ParallelAlgorithms::parallel_linear_in_place_preventative_adaptive_sort(uintsCopy, (unsigned long)uints.size(), true, 0.01, uints.size() / 6); // using 4-cores is fastest on 6-core CPU
//ParallelAlgorithms::parallel_linear_in_place_preventative_adaptive_sort(uintsCopy, (unsigned long)uints.size(), true, 0.9, uints.size() / 8); // using 8-cores is fastest on 48-core CPU
//ParallelAlgorithms::parallel_linear_in_place_preventative_adaptive_sort(uintsCopy, (unsigned long)uints.size(), false, 0.01, uints.size() / 24);
//std::sort(uintsCopy, uintsCopy + uints.size());
const auto endTime = high_resolution_clock::now();
std::sort(std::execution::par_unseq, uintsCopy2, uintsCopy2 + uints.size());
//std::stable_sort(std::execution::par_unseq, uintsCopy2, uintsCopy2 + uints.size());
//if (std::equal(sorted, sorted + uints.size(), uintsCopy2))
if (std::equal(uintsCopy, uintsCopy + uints.size(), uintsCopy2))
std::cout << "Arrays are equal ";
else
std::cout << "Arrays are not equal ";
print_results("Parallel InPlace Merge Sort", uintsCopy, uints.size(), startTime, endTime);
}
delete[] sorted;
delete[] uintsCopy2;
delete[] uintsCopy;
return 0;
}
int ParallelMergeSortBenchmark(vector<unsigned>& uints)
{
// generate some random uints:
printf("\nBenchmarking Parallel Merge Sort Hybrid with %zu unsigned integers (each of %lu bytes)...\n", uints.size(), (unsigned long)sizeof(unsigned));
unsigned* uintsCopy = new unsigned[uints.size()];
unsigned* uintsCopy2 = new unsigned[uints.size()];
unsigned* sorted = new unsigned[uints.size()];
// time how long it takes to sort them:
for (int i = 0; i < iterationCount; ++i)
{
for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting
uintsCopy[ j] = uints[j];
uintsCopy2[j] = uints[j];
sorted[ j] = (unsigned)j; // page in the destination array into system memory
}
const auto startTime = high_resolution_clock::now();
//ParallelAlgorithms::parallel_merge_sort_hybrid_rh_1(uintsCopy, 0, (int)(uints.size() - 1), sorted); // ParallelMergeSort modifies the source array
ParallelAlgorithms::parallel_merge_sort_hybrid(uintsCopy, (size_t)0, uints.size() - 1, sorted, false); // ParallelMergeSort modifies the source array
//ParallelAlgorithms::parallel_merge_merge_sort_hybrid(uintsCopy, (size_t)0, uints.size() - 1, sorted, false); // ParallelMergeSort modifies the source array
const auto endTime = high_resolution_clock::now();
std::sort(std::execution::par_unseq, uintsCopy2, uintsCopy2 + uints.size());
//std::stable_sort(std::execution::par_unseq, uintsCopy2, uintsCopy2 + uints.size());
//if (std::equal(sorted, sorted + uints.size(), uintsCopy2))
if (std::equal(uintsCopy, uintsCopy + uints.size(), uintsCopy2))
std::cout << "Arrays are equal ";
else
std::cout << "Arrays are not equal ";
print_results("Parallel Merge Sort", sorted, uints.size(), startTime, endTime);
}
delete[] sorted;
delete[] uintsCopy;
return 0;
}
int ParallelMergeBenchmark()
{
const size_t testSize = 10'000'000;
random_device rd;
// generate some random uints:
vector<unsigned> uints_0(testSize);
for (auto& d : uints_0)
d = static_cast<unsigned>(rd());
printf("\nBenchmarking Parallel Merge with %zu unsigned integers (each of %lu bytes)...\n", uints_0.size(), (unsigned long)sizeof(unsigned));
#if 1
sort(std::execution::par_unseq, uints_0.begin(), uints_0.begin() + testSize/2);
sort(std::execution::par_unseq, uints_0.begin() + testSize/2, uints_0.end());
#else
sort(oneapi::dpl::execution::par_unseq, uints_0.begin(), uints_0.begin() + testSize / 2);
sort(oneapi::dpl::execution::par_unseq, uints_0.begin() + testSize / 2, uints_0.end());
#endif
// time how long it takes to merge them them:
for (int i = 0; i < iterationCount; ++i)
{
vector<unsigned> uints_work(uints_0); // copy the original into a working vector, since it's an in-place merge
const auto startTime = high_resolution_clock::now();
std::inplace_merge(std::execution::par_unseq, uints_work.begin(), uints_work.begin() + testSize / 2, uints_work.end());
const auto endTime = high_resolution_clock::now();
print_results("Parallel Merge", uints_work.data(), uints_work.size(), startTime, endTime);
}
// time how long it takes to merge them them:
for (int i = 0; i < iterationCount; ++i)
{
vector<unsigned> uints_work(uints_0); // copy the original into a working vector, since it's an in-place merge
const auto startTime = high_resolution_clock::now();
std::inplace_merge(uints_work.begin(), uints_work.begin() + testSize / 2, uints_work.end());
const auto endTime = high_resolution_clock::now();
print_results("Parallel Merge", uints_work.data(), uints_work.size(), startTime, endTime);
}
return 0;
}