Skip to content

Commit b43ac34

Browse files
kbowers-jump authored and asiegel-jt committed
Parallelization tune up and other cleanups
Polished up some thread parallelism rough edges noted in the recent parallel thread sorting API installation. Significant performance improvement for thread parallelism strong scaling (not that it wasn't already very fast) and some low impact tpool API changes from a user point of view for existing usages (that make it easier to use and more flexible to deploy): - Improved cache traffic between dispatch and worker threads in tpool. Thread dispatch / sync is low tens of percent faster (from ~400 ns per level to ~300 ns per level by reducing cache contention). A side effect of this is that the tpool state variable was split into two sequence number variables. This eliminated the FD_TPOOL_WORKER_STATE_* defines and corresponding fd_tpool_state_cstr APIs. It also replaced the fd_tpool_worker_state with a simpler to use fd_tpool_worker_idle API. - Added a run-time option for tpool threads to sleep when idle instead of spinning when idle. This option slows down thread dispatch by over an order of magnitude (e.g. ~28 us instead of ~1.8 us to dispatch and synchronize 64 threads on a typical x86 server ... the OS does a _terrible_ job at parallelism, especially at high core count). But this is helpful for various kinds of testing and for situations where low latency / strong scaling is not a concern (especially with floating threads, low priority background processing, etc). This added a new opt argument to fd_tpool_init. - Polished up FD_MAP_REDUCE / FD_FOR_ALL. These now support passing more arguments, more flexibly and more simply than previously. This tweaked how arguments are presented to the implementation (simplifying practically, no more LOC than required for writing a normal function input argument list). The dispatch logic was similarly tuned to take advantage of the above. - Removed dependence on fd_scratch APIs from tpool. This simplifies API usage and removes the scratch / scratch_sz argument from fd_tpool_worker_push. 
At this point, I suspect the fd_scratch API can be fully eliminated in favor of spad. - Gave test_tpool the ability to select spin or sleeping style from the command line and made a minor improvement to the accuracy of the dispatch performance model in the various benchmarks. - Did a linting pass on fd_util.h and related includes to reflect current util API organization.
1 parent dadcd1b commit b43ac34

File tree

18 files changed

+983
-870
lines changed

18 files changed

+983
-870
lines changed

src/app/ledger/main.c

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -221,12 +221,12 @@ init_tpool( fd_ledger_args_t * ledger_args ) {
221221

222222
ulong start_idx = 1UL;
223223
if( tcnt>=1UL ) {
224-
tpool = fd_tpool_init( ledger_args->tpool_mem, tcnt );
224+
tpool = fd_tpool_init( ledger_args->tpool_mem, tcnt, 0UL );
225225
if( tpool == NULL ) {
226226
FD_LOG_ERR(( "failed to create thread pool" ));
227227
}
228228
for( ulong i=1UL; i<tcnt; ++i ) {
229-
if( fd_tpool_worker_push( tpool, start_idx++, NULL, 0UL ) == NULL ) {
229+
if( fd_tpool_worker_push( tpool, start_idx++ ) == NULL ) {
230230
FD_LOG_ERR(( "failed to launch worker" ));
231231
}
232232
else {
@@ -249,25 +249,24 @@ init_tpool( fd_ledger_args_t * ledger_args ) {
249249
FD_LOG_ERR(( "This is an invalid value for the number of threads to use for snapshot creation" ));
250250
}
251251

252-
fd_tpool_t * snapshot_bg_tpool = fd_tpool_init( ledger_args->tpool_mem_snapshot_bg, snapshot_tcnt );
253-
if( FD_UNLIKELY( !fd_tpool_worker_push( snapshot_bg_tpool, start_idx++, NULL, 0UL ) ) ) {
252+
fd_tpool_t * snapshot_bg_tpool = fd_tpool_init( ledger_args->tpool_mem_snapshot_bg, snapshot_tcnt, 0UL );
253+
if( FD_UNLIKELY( !fd_tpool_worker_push( snapshot_bg_tpool, start_idx++ ) ) ) {
254254
FD_LOG_ERR(( "failed to launch worker" ));
255255
} else {
256256
FD_LOG_NOTICE(( "launched snapshot bg worker %lu", start_idx - 1UL ));
257257
}
258258

259259
ledger_args->snapshot_bg_tpool = snapshot_bg_tpool;
260260

261-
262261
if( snapshot_tcnt==2UL ) {
263262
return 0;
264263
}
265264

266265
/* If a snapshot is being created, setup its own tpool. */
267266

268-
fd_tpool_t * snapshot_tpool = fd_tpool_init( ledger_args->tpool_mem_snapshot, snapshot_tcnt - 1UL );
267+
fd_tpool_t * snapshot_tpool = fd_tpool_init( ledger_args->tpool_mem_snapshot, snapshot_tcnt - 1UL, 0UL );
269268
for( ulong i=1UL; i<snapshot_tcnt - 1UL; ++i ) {
270-
if( FD_UNLIKELY( !fd_tpool_worker_push( snapshot_tpool, start_idx++, NULL, 0UL ) ) ) {
269+
if( FD_UNLIKELY( !fd_tpool_worker_push( snapshot_tpool, start_idx++ ) ) ) {
271270
FD_LOG_ERR(( "failed to launch worker" ));
272271
} else {
273272
FD_LOG_NOTICE(( "launched snapshot hash worker %lu", start_idx - 1UL ));
@@ -1251,7 +1250,6 @@ replay( fd_ledger_args_t * args ) {
12511250

12521251
FD_SPAD_FRAME_BEGIN( spad ) {
12531252

1254-
12551253
/* Setup slot_ctx */
12561254
fd_funk_t * funk = args->funk;
12571255

src/discof/consensus/test_consensus.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -892,7 +892,7 @@ main( void ) {
892892
// uchar tpool_mem[FD_TPOOL_FOOTPRINT( FD_TILE_MAX )]__attribute__((aligned(FD_TPOOL_ALIGN))) = { 0 };
893893
// /* clang-format on */
894894
// if( tile_cnt > 4 ) {
895-
// tpool = fd_tpool_init( tpool_mem, tile_cnt );
895+
// tpool = fd_tpool_init( tpool_mem, tile_cnt, 0UL );
896896
// FD_TEST( tpool );
897897
// if( tpool == NULL ) FD_LOG_ERR( ( "failed to create thread pool" ) );
898898
// ulong scratch_sz = fd_scratch_smem_footprint( 256 << 20 );

src/flamenco/runtime/fd_runtime.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,6 @@ fd_runtime_validate_fee_collector( fd_exec_slot_ctx_t const * slot_ctx,
281281
return 0UL;
282282
}
283283

284-
285284
static int
286285
fd_runtime_run_incinerator( fd_exec_slot_ctx_t * slot_ctx ) {
287286
FD_TXN_ACCOUNT_DECL( rec );
@@ -658,7 +657,6 @@ fd_runtime_collect_from_existing_account( ulong slot,
658657
#undef COLLECT_RENT
659658
}
660659

661-
662660
/* fd_runtime_collect_rent_from_account performs rent collection duties.
663661
Although the Solana runtime prevents the creation of new accounts
664662
that are subject to rent, some older accounts still undergo the
@@ -1391,7 +1389,6 @@ fd_runtime_prepare_txns_start( fd_exec_slot_ctx_t * slot_ctx,
13911389

13921390
fd_rawtxn_b_t raw_txn = { .raw = txn->payload, .txn_sz = (ushort)txn->payload_sz };
13931391

1394-
13951392
task_info[txn_idx].txn_ctx->spad = runtime_spad;
13961393
task_info[txn_idx].txn_ctx->spad_wksp = fd_wksp_containing( runtime_spad );
13971394
int err = fd_execute_txn_prepare_start( slot_ctx,
@@ -1837,8 +1834,7 @@ fd_runtime_process_txns_in_microblock_stream( fd_exec_slot_ctx_t * slot_ctx,
18371834
if( curr_exec_idx>=txn_cnt ) {
18381835
break;
18391836
}
1840-
int state = fd_tpool_worker_state( tpool, worker_idx );
1841-
if( state!=FD_TPOOL_WORKER_STATE_IDLE ) {
1837+
if( !fd_tpool_worker_idle( tpool, worker_idx ) ) {
18421838
continue;
18431839
}
18441840

src/flamenco/runtime/tests/harness/fd_block_harness.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -374,8 +374,8 @@ fd_runtime_fuzz_block_ctx_exec( fd_runtime_fuzz_runner_t * runner,
374374
/* Initialize tpool and spad(s) */
375375
ulong worker_max = FD_BLOCK_HARNESS_TPOOL_WORKER_CNT;
376376
void * tpool_mem = fd_spad_alloc( runner->spad, FD_TPOOL_ALIGN, FD_TPOOL_FOOTPRINT( worker_max ) );
377-
fd_tpool_t * tpool = fd_tpool_init( tpool_mem, worker_max );
378-
fd_tpool_worker_push( tpool, 1UL, NULL, 0UL );
377+
fd_tpool_t * tpool = fd_tpool_init( tpool_mem, worker_max, 0UL );
378+
fd_tpool_worker_push( tpool, 1UL );
379379

380380
fd_spad_t * runtime_spad = runner->spad;
381381

src/util/alloc/fd_alloc.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@
120120
time to try to bound the amount of pre-allocation for small requests. */
121121

122122
#include "../wksp/fd_wksp.h"
123-
#include "../valloc/fd_valloc.h"
124123

125124
/* FD_ALLOC_{ALIGN,FOOTPRINT} give the required alignment and footprint
126125
needed for a wksp allocation to be suitable as a fd_alloc. ALIGN is
@@ -633,4 +632,3 @@ fd_alloc_virtual( fd_alloc_t * alloc ) {
633632
FD_PROTOTYPES_END
634633

635634
#endif /* HEADER_fd_src_util_alloc_fd_alloc_h */
636-

src/util/archive/fd_tar.h

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
format. This is not a general-purpose TAR implementation. It is
66
currently only intended for loading and writing Solana snapshots. */
77

8-
#include "../fd_util_base.h"
98
#include "../io/fd_io.h"
109

1110
/* File Format ********************************************************/
@@ -84,7 +83,7 @@ fd_tar_set_octal( char buf[ static 12 ],
8483
ulong val );
8584

8685
/* fd_tar_meta_set_size sets the size field. Returns 1 on success, 0
87-
if sz is too large to be represented in TAR header. Set size using the
86+
if sz is too large to be represented in TAR header. Set size using the
8887
OLDGNU size extension to allow for unlimited file sizes. The first byte
8988
must be 0x80 followed by 0s and then the size in binary. */
9089

@@ -240,30 +239,30 @@ fd_tar_read( void * reader,
240239
2. Write out file data with fd_tar_writer_write_file_data( writer, data, data_sz ).
241240
This can be done as many times as you want.
242241
3. Finish the current file with fd_tar_writer_fini_file( writer ).
243-
244-
When you are done, call fd_tar_writer_delete( writer ) to write out the
242+
243+
When you are done, call fd_tar_writer_delete( writer ) to write out the
245244
tar archive trailer and close out the file descriptor.
246245
247-
If you want to reserve space for an existing file and write back to it
246+
If you want to reserve space for an existing file and write back to it
248247
at some point in the future see the below comments for
249248
fd_tar_writer_{make,fill}_space().
250-
249+
251250
*/
252251

253252
struct fd_tar_writer {
254253
int fd; /* The file descriptor for the tar archive. */
255254
ulong header_pos; /* The position in the file for the current files header.
256-
If there is no current file that is being streamed out,
255+
If there is no current file that is being streamed out,
257256
the header_pos will be equal to ULONG_MAX. */
258257
ulong data_sz; /* The size of the current files data. If there is no
259258
current file that is being streamed out, the data_sz
260259
will be equal to ULONG_MAX. */
261260
ulong wb_pos; /* If this value is not equal to ULONG_MAX that means that
262-
this is the position at which to write back to with a
261+
this is the position at which to write back to with a
263262
call to fd_tar_writer_fill_space. */
264-
/* TODO: Right now, the stream to the tar writer just uses fd_io_write.
263+
/* TODO: Right now, the stream to the tar writer just uses fd_io_write.
265264
This can eventually be abstracted to use write callbacks that use
266-
fd_io streaming under the hood. This adds some additional complexity
265+
fd_io streaming under the hood. This adds some additional complexity
267266
that's related to writing back into the header: if the header is still
268267
in the ostream buf, modify the buffer. Otherwise, read the header
269268
directly from the file. */
@@ -333,15 +332,15 @@ fd_tar_writer_fini_file( fd_tar_writer_t * writer );
333332
/* fd_tar_writer_make_space and fd_tar_writer_fill_space, allow for writing
334333
back to a specific place in the tar stream. This can be used by first
335334
making a call to fd_tar_write_new_file, fd_tar_writer_make_space, and
336-
fd_tar_writer_fini_file. This will populate the header and write out
335+
fd_tar_writer_fini_file. This will populate the header and write out
337336
random bytes. The start of this data file will be saved by the tar writer.
338-
Up to n data files can be appended to the tar archive before a call to
337+
Up to n data files can be appended to the tar archive before a call to
339338
fd_tar_writer_fill_space. fd_tar_writer_fill_space should only be called
340339
after an unpaired call to fd_tar_writer_make_space and it requires a valid
341340
fd_tar_writer_t handle. It allows the user to write back to the point at
342341
which they made space. _make_space and _fill_space should be paired together.
343342
There can only be one outstanding call to make_space at a time.
344-
343+
345344
TODO: This can be extended to support multiple write backs. */
346345

347346
int

src/util/bits/fd_bits.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
/* Bit manipulation APIs */
55

6-
#include "../sanitize/fd_msan.h"
6+
#include "../sanitize/fd_sanitize.h"
77

88
FD_PROTOTYPES_BEGIN
99

src/util/fd_util.h

Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,50 @@
11
#ifndef HEADER_fd_src_util_fd_util_h
22
#define HEADER_fd_src_util_fd_util_h
33

4-
//#include "fd_util_base.h" /* includes stdalign.h, string.h, limits.h, float.h */
5-
//#include "sanitize/fd_msan.h" /* includes fd_util_base.h */
6-
//#include "bits/fd_bits.h" /* includes sanitize/fd_msan.h */
7-
//#include "sanitize/fd_asan.h" /* includes fd_util_base.h" */
8-
//#include "sanitize/fd_msan.h" /* includes fd_util_base.h" */
9-
//#include "sanitize/fd_sanitize.h" /* includes sanitize/fd_asan.h sanitize/fd_msan.h */
10-
//#include "io/fd_io.h" /* includes bits/fd_bits.h */
11-
#include "spad/fd_spad.h" /* includes bits/fd_bits.h */
12-
//#include "cstr/fd_cstr.h" /* includes bits/fd_bits.h */
13-
//#include "pod/fd_pod.h" /* includes cstr/fd_cstr.h */
14-
//#include "env/fd_env.h" /* includes cstr/fd_cstr.h */
15-
//#include "log/fd_log.h" /* includes env/fd_env.h io/fd_io.h */
16-
//#include "checkpt/fd_checkpt.h" /* includes log/fd_log.h */
17-
//#include "shmem/fd_shmem.h" /* includes log/fd_log.h */
18-
//#include "tile/fd_tile.h" /* includes shmem/fd_shmem.h */
19-
//#include "scratch/fd_scratch.h" /* includes tile/fd_tile.h sanitize/fd_sanitize.h valloc/fd_valloc.h */
20-
//#include "tpool/fd_tpool.h" /* includes tile/fd_tile.h scratch/fd_scratch.h */
21-
//#include "wksp/fd_wksp.h" /* tpool/fd_tpool.h checkpt/fd_checkpt.h sanitize/fd_sanitize.h */
22-
#include "alloc/fd_alloc.h" /* includes wksp/fd_wksp.h valloc/fd_valloc.h */
23-
#include "rng/fd_rng.h" /* includes bits/fd_bits.h */
24-
25-
/* FIXME: Should these be optional APIs? */
26-
#include "sandbox/fd_sandbox.h" /* includes fd_util_base.h */
27-
//#include "math/fd_stat.h" /* includes bits/fd_bits.h */
28-
#include "bits/fd_sat.h" /* includes bits/fd_bits.h */
29-
//#include "hist/fd_histf.h" /* includes log/fd_log.h */
4+
//#include "fd_util_base.h" /* includes stdalign.h string.h limits.h float.h */
5+
//#include "sanitize/fd_sanitize.h" /* includes fd_util_base.h (fd_asan.h fd_msan.h) */
6+
//#include "valloc/fd_valloc.h" /* includes fd_util_base.h */ /* FIXME: deprecate? */
7+
//#include "bits/fd_bits.h" /* includes sanitize/fd_sanitize.h (fd_bits_find_lsb.h fd_bits_find_msb.h fd_bits_tg.h) */
8+
//#include "io/fd_io.h" /* includes bits/fd_bits.h */
9+
//#include "cstr/fd_cstr.h" /* includes bits/fd_bits.h */
10+
#include "rng/fd_rng.h" /* includes bits/fd_bits.h */
11+
#include "spad/fd_spad.h" /* includes bits/fd_bits.h valloc/fd_valloc.h */
12+
//#include "env/fd_env.h" /* includes cstr/fd_cstr.h */
13+
//#include "log/fd_log.h" /* includes env/fd_env.h io/fd_io.h */
14+
//#include "checkpt/fd_checkpt.h" /* includes log/fd_log.h */
15+
//#include "shmem/fd_shmem.h" /* includes log/fd_log.h */
16+
//#include "tile/fd_tile.h" /* includes shmem/fd_shmem.h */
17+
//#include "scratch/fd_scratch.h" /* includes tile/fd_tile.h valloc/fd_valloc.h */ /* FIXME: deprecate non alloca parts? */
18+
//#include "tpool/fd_tpool.h" /* includes scratch/fd_scratch.h */
19+
//#include "wksp/fd_wksp.h" /* includes tpool/fd_tpool.h checkpt/fd_checkpt.h */
20+
#include "alloc/fd_alloc.h" /* includes wksp/fd_wksp.h */
21+
22+
#include "sandbox/fd_sandbox.h" /* includes fd_util_base.h */ /* FIXME: should this be included by default? */
23+
#include "bits/fd_sat.h" /* includes bits/fd_bits.h */ /* FIXME: should this be included by default? */
3024

3125
/* Additional fd_util APIs that are not included by default */
3226

33-
//#include "archive/fd_ar.h" /* includes fd_util_base.h */
34-
//#include "net/fd_pcapng.h" /* includes fd_util_base.h */
35-
//#include "net/fd_eth.h" /* includes bits/fd_bits.h */
36-
//#include "net/fd_ip4.h" /* includes bits/fd_bits.h */
37-
//#include "net/fd_igmp.h" /* includes net/fd_ip4.h */
38-
//#include "net/fd_udp.h" /* includes net/fd_ip4.h */
39-
//#include "net/fd_net_headers.h */ /* includes net/fd_udp.h net/fd_eth.h */
40-
//#include "net/fd_pcap.h" /* includes net/fd_eth.h log/fd_log.h */
41-
//#include "bits/fd_float.h" /* includes bits/fd_bits.h */
42-
//#include "bits/fd_uwide.h" /* includes bits/fd_bits.h */
43-
//#include "math/fd_sqrt.h" /* includes bits/fd_bits.h */
44-
//#include "math/fd_fxp.h" /* includes math/fd_sqrt.h, (!FD_HAS_INT128) bits/fd_uwide.h */
45-
//#include "simd/fd_sse.h" /* includes bits/fd_bits.h, requires FD_HAS_SSE */
46-
//#include "simd/fd_avx.h" /* includes bits/fd_bits.h, requires FD_HAS_AVX */
47-
//#include "simd/fd_avx512.h" /* includes bits/fd_bits.h, requires FD_HAS_AVX512 */
27+
//#include "archive/fd_ar.h" /* includes fd_util_base.h */
28+
//#include "archive/fd_tar.h" /* includes fd_io.h */
29+
//#include "bits/fd_float.h" /* includes bits/fd_bits.h */
30+
//#include "bits/fd_uwide.h" /* includes bits/fd_bits.h */
31+
//#include "hist/fd_histf.h" /* includes log/fd_log.h math.h (FD_HAS_AVX) fd_avx.h */
32+
//#include "math/fd_stat.h" /* includes bits/fd_bits.h */
33+
//#include "math/fd_sqrt.h" /* includes bits/fd_bits.h */
34+
//#include "math/fd_fxp.h" /* includes math/fd_sqrt.h, (!FD_HAS_INT128) bits/fd_uwide.h */
35+
//#include "net/fd_pcapng.h" /* includes fd_util_base.h */
36+
//#include "net/fd_eth.h" /* includes bits/fd_bits.h */
37+
//#include "net/fd_ip4.h" /* includes bits/fd_bits.h */
38+
//#include "net/fd_igmp.h" /* includes net/fd_ip4.h */
39+
//#include "net/fd_udp.h" /* includes net/fd_ip4.h */
40+
//#include "net/fd_net_headers.h" /* includes net/fd_udp.h net/fd_eth.h */
41+
//#include "net/fd_pcap.h" /* includes net/fd_eth.h log/fd_log.h */
42+
//#include "pod/fd_pod.h" /* includes cstr/fd_cstr.h */
43+
//#include "sanitize/fd_fuzz.h" /* includes fd_util_base.h */
44+
//#include "sanitize/fd_backtrace.h" /* FIXME: this probably should be merged with another header */
45+
//#include "simd/fd_sse.h" /* includes bits/fd_bits.h, requires FD_HAS_SSE */
46+
//#include "simd/fd_avx.h" /* includes bits/fd_bits.h, requires FD_HAS_AVX */
47+
//#include "simd/fd_avx512.h" /* includes bits/fd_bits.h, requires FD_HAS_AVX512 */
4848

4949
FD_PROTOTYPES_BEGIN
5050

src/util/scratch/fd_scratch.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
It is meant for use in situations that have very complex and large
1010
temporary memory usage. */
1111

12-
#include "../sanitize/fd_sanitize.h"
1312
#include "../tile/fd_tile.h"
1413
#include "../valloc/fd_valloc.h"
1514

src/util/spad/fd_spad.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
#ifndef HEADER_fd_src_util_spad_fd_spad_h
22
#define HEADER_fd_src_util_spad_fd_spad_h
33

4-
#include "../sanitize/fd_sanitize.h"
5-
64
/* APIs for high performance persistent inter-process shared scratch pad
75
memories. A spad as a scratch pad that behaves very much like a
86
thread's stack:
@@ -296,7 +294,6 @@ FD_FN_PURE static inline ulong fd_spad_mem_wmark( fd_spad_t const * spad ) { ret
296294

297295
FD_FN_PURE static inline int fd_spad_in_frame( fd_spad_t const * spad ) { return spad->frame_free<FD_SPAD_FRAME_MAX; }
298296

299-
300297
/* operations */
301298
/* fd_spad_alloc_max returns the maximum number of bytes with initial
302299
byte alignment of align that can currently be allocated / prepared
@@ -532,7 +529,6 @@ fd_spad_virtual( fd_spad_t * spad ) {
532529
return valloc;
533530
}
534531

535-
536532
/* fn implementations */
537533
static inline void
538534
fd_spad_reset_impl( fd_spad_t * spad ) {

0 commit comments

Comments
 (0)