|
1 | 1 | #include "price_model.h"
|
2 | 2 | #include "../util/avg.h" /* For avg_2_int64 */
|
3 | 3 |
|
4 |
| -/* |
5 |
| - * In-place bottom-up Heapsort implementation optimized for minimal compute unit usage in BPF. |
6 |
| - * |
7 |
| - * Initially it creates a max heap in linear time and then to get ascending |
8 |
| - * order it swaps the root with the last element and then sifts down the root. |
9 |
| - * |
10 |
| - * The number of comparisions in average case is nlgn + O(1) and in worst case is |
11 |
| - * 1.5nlgn + O(n). |
12 |
| - * |
13 |
| - * There are a lot of (j-1) or (j+1) math in the code which can be optimized by |
14 |
| - * thinking of a as 1-based array. Fortunately, BPF compiler optimizes that for us. |
15 |
| - */ |
16 |
| -void heapsort(int64_t * a, uint64_t n) { |
17 |
| - if (n <= 1) return; |
18 |
| - |
19 |
| - /* |
20 |
| - * This is a bottom-up heapify which is linear in time. |
21 |
| - */ |
22 |
| - for (uint64_t i = n / 2 - 1;; --i) { |
23 |
| - int64_t root = a[i]; |
24 |
| - uint64_t j = i * 2 + 1; |
25 |
| - while (j < n) { |
26 |
| - if (j + 1 < n && a[j] < a[j + 1]) ++j; |
27 |
| - if (root >= a[j]) break; |
28 |
| - a[(j - 1) / 2] = a[j]; |
29 |
| - j = j * 2 + 1; |
30 |
| - } |
31 |
| - a[(j - 1) / 2] = root; |
32 |
| - |
33 |
| - if (i == 0) break; |
34 |
| - } |
35 |
| - |
36 |
| - for (uint64_t i = n - 1; i > 0; --i) { |
37 |
| - int64_t tmp = a[0]; |
38 |
| - a[0] = a[i]; |
39 |
| - a[i] = tmp; |
40 |
| - |
41 |
| - int64_t root = a[0]; |
42 |
| - uint64_t j = 1; |
43 |
| - while (j < i) { |
44 |
| - if (j + 1 < i && a[j] < a[j + 1]) ++j; |
45 |
| - if (root >= a[j]) break; |
46 |
| - a[(j - 1) / 2] = a[j]; |
47 |
| - j = j * 2 + 1; |
48 |
| - } |
49 |
| - a[(j - 1) / 2] = root; |
50 |
| - } |
51 |
| -} |
| 4 | +#define SORT_NAME int64_sort_ascending |
| 5 | +#define SORT_KEY_T int64_t |
| 6 | +#include "../sort/tmpl/sort_stable.c" |
52 | 7 |
|
53 |
| -/* |
54 |
| - * Find the 25, 50, and 75 percentiles of the given quotes using heapsort. |
55 |
| - * |
56 |
| - * This implementation optimizes the price_model_core function for minimal compute unit usage in BPF. |
57 |
| - * |
58 |
| - * In Solana, each BPF instruction costs 1 unit of compute and is much different than a native code |
59 |
| - * execution time. Here are some of the differences: |
60 |
| - * 1. There is no cache, so memory access is much more expensive. |
61 |
| - * 2. The instruction set is very minimal, and there are only 10 registers available. |
62 |
| - * 3. The BPF compiler is not very good at optimizing the code. |
63 |
| - * 4. The stack size is limited and having extra stack frame has high overhead. |
64 |
| - * |
65 |
| - * This implementation is chosen among other implementations such as merge-sort, quick-sort, and quick-select |
66 |
| - * because it is very fast, has small number of instructions, and has a very small memory footprint by being |
67 |
| - * in-place and is non-recursive and has a nlogn worst-case time complexity. |
68 |
| - */ |
69 | 8 | int64_t *
|
70 | 9 | price_model_core( uint64_t cnt,
|
71 | 10 | int64_t * quote,
|
72 | 11 | int64_t * _p25,
|
73 | 12 | int64_t * _p50,
|
74 |
| - int64_t * _p75) { |
75 |
| - heapsort(quote, cnt); |
| 13 | + int64_t * _p75, |
| 14 | + void * scratch ) { |
| 15 | + |
| 16 | + /* Sort the quotes. The sorting implementation used here is a highly |
| 17 | + optimized mergesort (merge with an unrolled insertion sorting |
| 18 | + network small n base cases). The best case is ~0.5 n lg n compares |
| 19 | + and the average and worst cases are ~n lg n compares. |
| 20 | +
|
| 21 | + While not completely data oblivious, this has quite low variance in |
| 22 | + operation count practically and this is _better_ than quicksort's |
| 23 | + average case and quicksort's worst case is a computational |
| 24 | + denial-of-service and timing attack vulnerable O(n^2). Unlike |
| 25 | + quicksort, this is also stable (but this stability does not |
| 26 | + currently matter ... it might be a factor in future models). |
| 27 | +
|
| 28 | + A data oblivious sorting network approach might be viable here with |
| 29 | + and would have a completely deterministic operations count. It |
| 30 | + currently isn't used as the best known practical approaches for |
| 31 | + general n have a worse algorithmic cost (O( n (lg n)^2 )) and, |
| 32 | + while the application probably doesn't need perfect obliviousness, |
| 33 | + mergesort is still moderately oblivious and the application can |
| 34 | + benefit from mergesort's lower operations cost. (The main drawback |
| 35 | + of mergesort over quicksort is that it isn't in place, but memory |
| 36 | + footprint isn't an issue here.) |
| 37 | +
|
| 38 | + Given the operations cost model (e.g. cache friendliness is not |
| 39 | + incorporated), a radix sort might be viable here (O(n) in best / |
| 40 | + average / worst). It currently isn't used as we expect invocations |
| 41 | + with small-ish n to be common and radix sort would be have large |
| 42 | + coefficients on the O(n) and additional fixed overheads that would |
| 43 | + make it more expensive than mergesort in this regime. |
| 44 | +
|
| 45 | + Note: price_model_cnt_valid( cnt ) implies |
| 46 | + int64_sort_ascending_cnt_valid( cnt ) currently. |
| 47 | +
|
| 48 | + Note: consider filtering out "NaN" quotes (i.e. INT64_MIN)? */ |
| 49 | + |
| 50 | + int64_t * sort_quote = int64_sort_ascending_stable( quote, cnt, scratch ); |
| 51 | + |
| 52 | + /* Extract the p25 |
| 53 | +
|
| 54 | + There are many variants with subtle tradeoffs here. One option is |
| 55 | + to interpolate when the ideal p25 is bracketed by two samples (akin |
| 56 | + to the p50 interpolation above when the number of quotes is even). |
| 57 | + That is, for p25, interpolate between quotes floor((cnt-2)/4) and |
| 58 | + ceil((cnt-2)/4) with the weights determined by cnt mod 4. The |
| 59 | + current preference is to not do that as it is slightly more |
| 60 | + complex, doesn't exactly always minimize the current loss function |
| 61 | + and is more exposed to the confidence intervals getting skewed by |
| 62 | + bum quotes with the number of quotes is small. |
| 63 | +
|
| 64 | + Another option is to use the inside quote of the above pair. That |
| 65 | + is, for p25, use quote ceil((cnt-2)/4) == floor((cnt+1)/4) == |
| 66 | + (cnt+1)>>2. The current preference is not to do this as, though |
| 67 | + this has stronger bum quote robustness, it results in p25==p50==p75 |
| 68 | + when cnt==3. (In this case, the above wants to do an interpolation |
| 69 | + between quotes 0 and 1 to for the p25 and between quotes 1 and 2 |
| 70 | + for the p75. But limiting to just the inside quote results in |
| 71 | + p25/p50/p75 all using the median quote.) |
| 72 | +
|
| 73 | + A tweak to this option, for p25, is to use floor(cnt/4) == cnt>>2. |
| 74 | + This is simple, has the same asymptotic behavior for large cnt, has |
| 75 | + good behavior in the cnt==3 case and practically as good bum quote |
| 76 | + rejection in the moderate cnt case. */ |
76 | 77 |
|
77 |
| - /* Extract the p25 */ |
78 | 78 | uint64_t p25_idx = cnt >> 2;
|
79 |
| - *_p25 = quote[p25_idx]; |
| 79 | + |
| 80 | + *_p25 = sort_quote[p25_idx]; |
80 | 81 |
|
81 | 82 | /* Extract the p50 */
|
| 83 | + |
82 | 84 | if( (cnt & (uint64_t)1) ) { /* Odd number of quotes */
|
| 85 | + |
83 | 86 | uint64_t p50_idx = cnt >> 1; /* ==ceil((cnt-1)/2) */
|
84 |
| - *_p50 = quote[p50_idx]; |
| 87 | + |
| 88 | + *_p50 = sort_quote[p50_idx]; |
| 89 | + |
85 | 90 | } else { /* Even number of quotes (at least 2) */
|
| 91 | + |
86 | 92 | uint64_t p50_idx_right = cnt >> 1; /* == ceil((cnt-1)/2)> 0 */
|
87 | 93 | uint64_t p50_idx_left = p50_idx_right - (uint64_t)1; /* ==floor((cnt-1)/2)>=0 (no overflow/underflow) */
|
88 | 94 |
|
89 |
| - int64_t vl = quote[p50_idx_left]; |
90 |
| - int64_t vr = quote[p50_idx_right]; |
| 95 | + int64_t vl = sort_quote[p50_idx_left ]; |
| 96 | + int64_t vr = sort_quote[p50_idx_right]; |
91 | 97 |
|
92 | 98 | /* Compute the average of vl and vr (with floor / round toward
|
93 | 99 | negative infinity rounding and without possibility of
|
94 | 100 | intermediate overflow). */
|
| 101 | + |
95 | 102 | *_p50 = avg_2_int64( vl, vr );
|
96 | 103 | }
|
97 | 104 |
|
98 | 105 | /* Extract the p75 (this is the mirror image of the p25 case) */
|
99 | 106 |
|
100 | 107 | uint64_t p75_idx = cnt - ((uint64_t)1) - p25_idx;
|
101 |
| - *_p75 = quote[p75_idx]; |
102 | 108 |
|
103 |
| - return quote; |
| 109 | + *_p75 = sort_quote[p75_idx]; |
| 110 | + |
| 111 | + return sort_quote; |
104 | 112 | }
|
0 commit comments