Commit 2ea67f3

refactor: replace merge-sort with heapsort (#405)
This change replaces the merge-sort with a heapsort that uses far fewer compute units (CU). The previous algorithm is very fast on a normal CPU but performs poorly in BPF, where all instructions (load, move, ...) have the same cost, there is no cache, and memory-alignment optimizations bring no real benefit. The major benefits of heapsort are that it is non-recursive, which avoids the high stack-frame overhead in BPF, and in-place, which minimizes the number of copies. Unfortunately there is no way to systematically extract the compute usage from a program test. The `test_benchmark` file contains simple code that helps run benchmarks with various numbers of publishers. In the 32-publisher setup, heapsort reduces the CU from 16.5k to 12k, and in the 64-publisher setup from 37k to 20.5k. These numbers are the worst cases observed on randomized input. Running on highly similar input (two distinct prices and two distinct confidences) gives 14.7k and 25k respectively, which is still better than before and is very unlikely in practice.
1 parent 9acf172 commit 2ea67f3

13 files changed: +239, -561 lines
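The `test_benchmark` code referenced in the commit message is not part of the hunks shown below. As a rough illustration only, here is a hypothetical stand-alone C driver that prepares the two input shapes described above (randomized, and "two distinct values") for the 32- and 64-publisher setups and runs the new `heapsort` on them. It does not measure compute units (that requires the Solana program-test harness), and every name other than `heapsort` is assumed.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Provided by price_model.c in this commit. */
void heapsort( int64_t * a, uint64_t n );

int
main( void ) {
  static const uint64_t setups[] = { 32UL, 64UL }; /* publisher counts from the commit message */

  for( size_t s = 0; s < sizeof(setups)/sizeof(setups[0]); s++ ) {
    uint64_t cnt = setups[s];   /* simplification: one quote per publisher */
    int64_t  quote[64];

    /* Worst-case-like shape: fully randomized input. */
    for( uint64_t i = 0; i < cnt; i++ ) quote[i] = (int64_t)rand();
    heapsort( quote, cnt );

    /* Highly similar shape: only two distinct values. */
    for( uint64_t i = 0; i < cnt; i++ ) quote[i] = (int64_t)(i & 1UL);
    heapsort( quote, cnt );

    printf( "sorted both input shapes for %lu publishers\n", (unsigned long)cnt );
  }
  return 0;
}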

program/c/makefile

Lines changed: 0 additions & 1 deletion

@@ -41,7 +41,6 @@ cpyth-native: features.h
 test: features.h
 	mkdir -p $(OUT_DIR)/test/
 	gcc -c ./src/oracle/model/test_price_model.c -o $(OUT_DIR)/test/test_price_model.o -fPIC
-	gcc -c ./src/oracle/sort/test_sort_stable.c -o $(OUT_DIR)/test/test_sort_stable.o -fPIC
 	gcc -c ./src/oracle/util/test_align.c -o $(OUT_DIR)/test/test_align.o -fPIC
 	gcc -c ./src/oracle/util/test_avg.c -o $(OUT_DIR)/test/test_avg.o -fPIC
 	gcc -c ./src/oracle/util/test_hash.c -o $(OUT_DIR)/test/test_hash.o -fPIC
program/c/src/oracle/model/price_model.c

Lines changed: 73 additions & 81 deletions
@@ -1,112 +1,104 @@
 #include "price_model.h"
 #include "../util/avg.h" /* For avg_2_int64 */
 
-#define SORT_NAME int64_sort_ascending
-#define SORT_KEY_T int64_t
-#include "../sort/tmpl/sort_stable.c"
+/*
+ * In-place bottom-up Heapsort implementation optimized for minimal compute unit usage in BPF.
+ *
+ * Initially it creates a max heap in linear time and then to get ascending
+ * order it swaps the root with the last element and then sifts down the root.
+ *
+ * The number of comparisions in average case is nlgn + O(1) and in worst case is
+ * 1.5nlgn + O(n).
+ *
+ * There are a lot of (j-1) or (j+1) math in the code which can be optimized by
+ * thinking of a as 1-based array. Fortunately, BPF compiler optimizes that for us.
+ */
+void heapsort(int64_t * a, uint64_t n) {
+  if (n <= 1) return;
+
+  /*
+   * This is a bottom-up heapify which is linear in time.
+   */
+  for (uint64_t i = n / 2 - 1;; --i) {
+    int64_t root = a[i];
+    uint64_t j = i * 2 + 1;
+    while (j < n) {
+      if (j + 1 < n && a[j] < a[j + 1]) ++j;
+      if (root >= a[j]) break;
+      a[(j - 1) / 2] = a[j];
+      j = j * 2 + 1;
+    }
+    a[(j - 1) / 2] = root;
+
+    if (i == 0) break;
+  }
+
+  for (uint64_t i = n - 1; i > 0; --i) {
+    int64_t tmp = a[0];
+    a[0] = a[i];
+    a[i] = tmp;
+
+    int64_t root = a[0];
+    uint64_t j = 1;
+    while (j < i) {
+      if (j + 1 < i && a[j] < a[j + 1]) ++j;
+      if (root >= a[j]) break;
+      a[(j - 1) / 2] = a[j];
+      j = j * 2 + 1;
+    }
+    a[(j - 1) / 2] = root;
+  }
+}
 
+/*
+ * Find the 25, 50, and 75 percentiles of the given quotes using heapsort.
+ *
+ * This implementation optimizes the price_model_core function for minimal compute unit usage in BPF.
+ *
+ * In Solana, each BPF instruction costs 1 unit of compute and is much different than a native code
+ * execution time. Here are some of the differences:
+ * 1. There is no cache, so memory access is much more expensive.
+ * 2. The instruction set is very minimal, and there are only 10 registers available.
+ * 3. The BPF compiler is not very good at optimizing the code.
+ * 4. The stack size is limited and having extra stack frame has high overhead.
+ *
+ * This implementation is chosen among other implementations such as merge-sort, quick-sort, and quick-select
+ * because it is very fast, has small number of instructions, and has a very small memory footprint by being
+ * in-place and is non-recursive and has a nlogn worst-case time complexity.
+ */
 int64_t *
 price_model_core( uint64_t  cnt,
                   int64_t * quote,
                   int64_t * _p25,
                   int64_t * _p50,
-                  int64_t * _p75,
-                  void    * scratch ) {
-
-  /* Sort the quotes.  The sorting implementation used here is a highly
-     optimized mergesort (merge with an unrolled insertion sorting
-     network small n base cases).  The best case is ~0.5 n lg n compares
-     and the average and worst cases are ~n lg n compares.
-
-     While not completely data oblivious, this has quite low variance in
-     operation count practically and this is _better_ than quicksort's
-     average case and quicksort's worst case is a computational
-     denial-of-service and timing attack vulnerable O(n^2).  Unlike
-     quicksort, this is also stable (but this stability does not
-     currently matter ... it might be a factor in future models).
-
-     A data oblivious sorting network approach might be viable here with
-     and would have a completely deterministic operations count.  It
-     currently isn't used as the best known practical approaches for
-     general n have a worse algorithmic cost (O( n (lg n)^2 )) and,
-     while the application probably doesn't need perfect obliviousness,
-     mergesort is still moderately oblivious and the application can
-     benefit from mergesort's lower operations cost.  (The main drawback
-     of mergesort over quicksort is that it isn't in place, but memory
-     footprint isn't an issue here.)
-
-     Given the operations cost model (e.g. cache friendliness is not
-     incorporated), a radix sort might be viable here (O(n) in best /
-     average / worst).  It currently isn't used as we expect invocations
-     with small-ish n to be common and radix sort would be have large
-     coefficients on the O(n) and additional fixed overheads that would
-     make it more expensive than mergesort in this regime.
-
-     Note: price_model_cnt_valid( cnt ) implies
-     int64_sort_ascending_cnt_valid( cnt ) currently.
-
-     Note: consider filtering out "NaN" quotes (i.e. INT64_MIN)? */
-
-  int64_t * sort_quote = int64_sort_ascending_stable( quote, cnt, scratch );
-
-  /* Extract the p25
-
-     There are many variants with subtle tradeoffs here.  One option is
-     to interpolate when the ideal p25 is bracketed by two samples (akin
-     to the p50 interpolation above when the number of quotes is even).
-     That is, for p25, interpolate between quotes floor((cnt-2)/4) and
-     ceil((cnt-2)/4) with the weights determined by cnt mod 4.  The
-     current preference is to not do that as it is slightly more
-     complex, doesn't exactly always minimize the current loss function
-     and is more exposed to the confidence intervals getting skewed by
-     bum quotes with the number of quotes is small.
-
-     Another option is to use the inside quote of the above pair.  That
-     is, for p25, use quote ceil((cnt-2)/4) == floor((cnt+1)/4) ==
-     (cnt+1)>>2.  The current preference is not to do this as, though
-     this has stronger bum quote robustness, it results in p25==p50==p75
-     when cnt==3.  (In this case, the above wants to do an interpolation
-     between quotes 0 and 1 to for the p25 and between quotes 1 and 2
-     for the p75.  But limiting to just the inside quote results in
-     p25/p50/p75 all using the median quote.)
-
-     A tweak to this option, for p25, is to use floor(cnt/4) == cnt>>2.
-     This is simple, has the same asymptotic behavior for large cnt, has
-     good behavior in the cnt==3 case and practically as good bum quote
-     rejection in the moderate cnt case. */
+                  int64_t * _p75) {
+  heapsort(quote, cnt);
 
+  /* Extract the p25 */
   uint64_t p25_idx = cnt >> 2;
-
-  *_p25 = sort_quote[p25_idx];
+  *_p25 = quote[p25_idx];
 
   /* Extract the p50 */
-
   if( (cnt & (uint64_t)1) ) { /* Odd number of quotes */
-
     uint64_t p50_idx = cnt >> 1; /* ==ceil((cnt-1)/2) */
-
-    *_p50 = sort_quote[p50_idx];
-
+    *_p50 = quote[p50_idx];
   } else { /* Even number of quotes (at least 2) */
-
     uint64_t p50_idx_right = cnt >> 1;                    /* == ceil((cnt-1)/2)> 0 */
     uint64_t p50_idx_left  = p50_idx_right - (uint64_t)1; /* ==floor((cnt-1)/2)>=0 (no overflow/underflow) */
 
-    int64_t vl = sort_quote[p50_idx_left ];
-    int64_t vr = sort_quote[p50_idx_right];
+    int64_t vl = quote[p50_idx_left];
+    int64_t vr = quote[p50_idx_right];
 
     /* Compute the average of vl and vr (with floor / round toward
        negative infinity rounding and without possibility of
        intermediate overflow). */
-
     *_p50 = avg_2_int64( vl, vr );
   }
 
   /* Extract the p75 (this is the mirror image of the p25 case) */
 
   uint64_t p75_idx = cnt - ((uint64_t)1) - p25_idx;
+  *_p75 = quote[p75_idx];
 
-  *_p75 = sort_quote[p75_idx];
-
-  return sort_quote;
+  return quote;
 }
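With the scratch parameter and the old `price_model` wrapper gone, the quotes are sorted in place and the caller simply passes the quote array itself. A minimal usage sketch of the new signature (the quote values below are made up for illustration):

#include <stdint.h>
#include <stdio.h>

#include "price_model.h"

int
main( void ) {
  /* Quotes are sorted in place; no scratch region is needed any more. */
  int64_t  quote[] = { 120, 100, 115, 95, 130, 105 };
  uint64_t cnt     = sizeof(quote)/sizeof(quote[0]);

  int64_t p25, p50, p75;
  int64_t * sorted = price_model_core( cnt, quote, &p25, &p50, &p75 );

  /* sorted == quote: the function returns its (now sorted) input array. */
  printf( "p25=%lld p50=%lld p75=%lld min=%lld max=%lld\n",
          (long long)p25, (long long)p50, (long long)p75,
          (long long)sorted[0], (long long)sorted[cnt-1] );
  return 0;
}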

program/c/src/oracle/model/price_model.h

Lines changed: 11 additions & 82 deletions

@@ -8,91 +8,20 @@
 extern "C" {
 #endif
 
-/* Returns the minimum and maximum number of quotes the implementation
-   can handle */
-
-static inline uint64_t
-price_model_quote_min( void ) {
-  return (uint64_t)1;
-}
-
-static inline uint64_t
-price_model_quote_max( void ) {
-  return (UINT64_MAX-(uint64_t)alignof(int64_t)+(uint64_t)1) / (uint64_t)sizeof(int64_t);
-}
-
-/* price_model_cnt_valid returns non-zero if cnt is a valid value or
-   zero if not. */
-
-static inline int
-price_model_cnt_valid( uint64_t cnt ) {
-  return price_model_quote_min()<=cnt && cnt<=price_model_quote_max();
-}
-
-/* price_model_scratch_footprint returns the number of bytes of scratch
-   space needed for an arbitrarily aligned scratch region required by
-   price_model to handle price_model_quote_min() to cnt quotes
-   inclusive. */
-
-static inline uint64_t
-price_model_scratch_footprint( uint64_t cnt ) { /* Assumes price_model_cnt_valid( cnt ) is true */
-  /* cnt int64_t's plus worst case alignment padding, no overflow
-     possible as cnt is valid at this point */
-  return cnt*(uint64_t)sizeof(int64_t)+(uint64_t)alignof(int64_t)-(uint64_t)1;
-}
-
-/* price_model_core minimizes (to quote precision in a floor / round
-   toward negative infinity sense) the loss model of the given quotes.
-   Assumes valid inputs (e.g. cnt is at least 1 and not unreasonably
-   large ... typically a multiple of 3 but this is not required,
-   quote[i] for i in [0,cnt) are the quotes of interest on input, p25,
-   p50, p75 point to where to write model outputs, scratch points to a
-   suitable footprint scratch region).
-
-   Returns a pointer to the quotes sorted in ascending order.  As such,
-   the min and max and any other rank statistic can be extracted easily
-   on return.  This location will either be quote itself or to a
-   location in scratch.  Use price_model below for a variant that always
-   replaces quote with the sorted quotes (potentially has extra ops for
-   copying).  Further, on return, *_p25, *_p50, *_p75 will hold the loss
-   model minimizing values for the input quotes and the scratch region
-   was clobbered.
-
-   Scratch points to a memory region of arbitrary alignment with at
-   least price_model_scratch_footprint( cnt ) bytes and it will be
-   clobbered on output.  It is sufficient to use a normally aligned /
-   normally allocated / normally declared array of cnt int64_t's.
-
-   The cost of this function is a fast and low variance (but not
-   completely data oblivious) O(cnt lg cnt) in the best / average /
-   worst cases.  This function uses no heap / dynamic memory allocation.
-   It is thread safe provided it passed non-conflicting quote, output
-   and scratch arrays.  It has a bounded call depth ~lg cnt <= ~64 (this
-   could reduce to O(1) by using a non-recursive sort/select
-   implementation under the hood if desired). */
-
-int64_t *                          /* Returns pointer to sorted quotes (either quote or ALIGN_UP(scratch,int64_t)) */
-price_model_core( uint64_t  cnt,   /* Assumes price_model_cnt_valid( cnt ) is true */
+/*
+ * This function computes the p25, p50 and p75 percentiles of the given quotes and
+ * writes them to the given pointers. It also returns the sorted quotes array. Being
+ * sorted is not necessary for this model to work, and is only relied upon by the
+ * tests to verify the correctness of the model with more confidence.
+ *
+ * The quote array might get modified by this function.
+ */
+int64_t *
+price_model_core( uint64_t  cnt,   /* Number of elements in quote */
                   int64_t * quote, /* Assumes quote[i] for i in [0,cnt) is the i-th quote on input */
                   int64_t * _p25,  /* Assumes *_p25 is safe to write to the p25 model output */
                   int64_t * _p50,  /* Assumes *_p50 " */
-                  int64_t * _p75,  /* Assumes *_p75 " */
-                  void    * scratch ); /* Assumes a suitable scratch region */
-
-/* Same as the above but always returns quote and quote always holds the
-   sorted quotes on return. */
-
-static inline int64_t *
-price_model( uint64_t  cnt,
-             int64_t * quote,
-             int64_t * _p25,
-             int64_t * _p50,
-             int64_t * _p75,
-             void    * scratch ) {
-  int64_t * tmp = price_model_core( cnt, quote, _p25, _p50, _p75, scratch );
-  if( tmp!=quote ) for( uint64_t idx=(uint64_t)0; idx<cnt; idx++ ) quote[ idx ] = tmp[ idx ];
-  return quote;
-}
+                  int64_t * _p75); /* Assumes *_p75 " */
 
 #ifdef __cplusplus
 }

program/c/src/oracle/model/test_price_model.c

Lines changed: 29 additions & 3 deletions

@@ -19,13 +19,38 @@ int test_price_model() {
   prng_t _prng[1];
   prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) );
 
-# define N 96
+# define N 192
 
   int64_t quote0 [N];
   int64_t quote  [N];
   int64_t val    [3];
-  int64_t scratch[N];
 
+  /* Brute force validate small sizes via the 0-1 principle. */
+  for( int cnt=0; cnt<=24; cnt++ ) {
+    for( long mask=0L; mask<(1L<<cnt); mask++ ) {
+      for( int i=0; i<cnt; i++ ) quote0[i] = (int64_t) ((mask>>i) & 1L);
+
+      memcpy( quote, quote0, sizeof(int64_t)*(size_t)cnt );
+      if( price_model_core( cnt, quote, val+0, val+1, val+2)!=quote ) { printf( "FAIL (01-compose)\n" ); return 1; }
+
+      /* Validate the results */
+
+      /* Although being sorted is not necessary it gives us more confidence about the correctness of the model */
+      qsort( quote0, (size_t)cnt, sizeof(int64_t), qcmp );
+      if( memcmp( quote, quote0, sizeof(int64_t)*(size_t)cnt ) ) { printf( "FAIL (01-sort)\n" ); return 1; }
+
+      uint64_t p25_idx = cnt>>2;
+      uint64_t p50_idx = cnt>>1;
+      uint64_t p75_idx = cnt - (uint64_t)1 - p25_idx;
+      uint64_t is_even = (uint64_t)!(cnt & (uint64_t)1);
+
+      if( val[0]!=quote[ p25_idx ] ) { printf( "FAIL (01-p25)\n" ); return 1; }
+      if( val[1]!=avg_2_int64( quote[ p50_idx-is_even ], quote[ p50_idx ] ) ) { printf( "FAIL (01-p50)\n" ); return 1; }
+      if( val[2]!=quote[ p75_idx ] ) { printf( "FAIL (01-p75)\n" ); return 1; }
+    }
+  }
+
+  /* Test using randomized inputs */
   for( int iter=0; iter<10000000; iter++ ) {
 
     /* Generate a random test */

@@ -36,10 +61,11 @@ int test_price_model() {
     /* Apply the model */
 
     memcpy( quote, quote0, sizeof(int64_t)*(size_t)cnt );
-    if( price_model( cnt, quote, val+0, val+1, val+2, scratch )!=quote ) { printf( "FAIL (compose)\n" ); return 1; }
+    if( price_model_core( cnt, quote, val+0, val+1, val+2)!=quote ) { printf( "FAIL (compose)\n" ); return 1; }
 
     /* Validate the results */
 
+    /* Although being sorted is not necessary it gives us more confidence about the correctness of the model */
     qsort( quote0, (size_t)cnt, sizeof(int64_t), qcmp );
     if( memcmp( quote, quote0, sizeof(int64_t)*(size_t)cnt ) ) { printf( "FAIL (sort)\n" ); return 1; }
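The test checks the in-place sort against `qsort`; the `qcmp` comparator it passes is defined elsewhere in test_price_model.c and is not shown in these hunks. A sketch of an ascending `int64_t` comparator of the kind assumed here (not the committed definition):

#include <stdint.h>

/* Ascending int64_t comparator for qsort. Written with explicit compares
   rather than "return (int)(a-b)", which can overflow and truncate. */
static int
qcmp( void const * _a,
      void const * _b ) {
  int64_t a = *(int64_t const *)_a;
  int64_t b = *(int64_t const *)_b;
  if( a < b ) return -1;
  if( a > b ) return  1;
  return 0;
}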

0 commit comments