12
12
13
13
#include < csp/adapters/parquet/ParquetReader.h>
14
14
#include < csp/adapters/utils/StructAdapterInfo.h>
15
+ #include < csp/adapters/parquet/ParquetOutputAdapter.h>
16
+ #include < csp/adapters/parquet/ParquetWriter.h>
17
+ #include < csp/python/PyObjectPtr.h>
15
18
16
19
static void * init_nparray ()
17
20
{
@@ -21,6 +24,24 @@ static void * init_nparray()
21
24
}
22
25
static void * s_init_array = init_nparray();
23
26
27
+ void ReleaseArrowSchemaPyCapsule ( PyObject* capsule ) {
28
+ struct ArrowSchema * schema =
29
+ ( struct ArrowSchema * )PyCapsule_GetPointer ( capsule, " arrow_schema" );
30
+ if ( schema->release != NULL ) {
31
+ schema->release ( schema );
32
+ }
33
+ free ( schema );
34
+ }
35
+
36
+ void ReleaseArrowArrayPyCapsule ( PyObject* capsule ) {
37
+ struct ArrowArray * array =
38
+ ( struct ArrowArray * )PyCapsule_GetPointer ( capsule, " arrow_array" );
39
+ if ( array->release != NULL ) {
40
+ array->release ( array );
41
+ }
42
+ free ( array );
43
+ }
44
+
24
45
namespace csp ::cppnodes
25
46
{
26
47
DECLARE_CPPNODE ( exprtk_impl )
@@ -403,9 +424,9 @@ DECLARE_CPPNODE( record_batches_to_struct )
403
424
START ()
404
425
{
405
426
// Create Adapters for Schema
406
- PyObject* capsule = csp::python::toPythonBorrowed (schema_ptr);
407
- struct ArrowSchema * c_schema = reinterpret_cast <struct ArrowSchema *>( PyCapsule_GetPointer (capsule, " arrow_schema" ) );
408
- auto result = arrow::ImportSchema (c_schema);
427
+ PyObject* capsule = csp::python::toPythonBorrowed ( schema_ptr );
428
+ struct ArrowSchema * c_schema = reinterpret_cast <struct ArrowSchema *>( PyCapsule_GetPointer ( capsule, " arrow_schema" ) );
429
+ auto result = arrow::ImportSchema ( c_schema );
409
430
if ( !result.ok () )
410
431
CSP_THROW ( ValueError, " Failed to load the arrow schema: " << result.status ().ToString () );
411
432
std::shared_ptr<arrow::Schema> schema = result.ValueUnsafe ();
@@ -414,15 +435,15 @@ DECLARE_CPPNODE( record_batches_to_struct )
414
435
for ( auto it = field_map -> begin (); it != field_map -> end (); ++it )
415
436
{
416
437
if ( schema -> GetFieldByName ( it.key () ) )
417
- columns.push_back (it.key ());
438
+ columns.push_back ( it.key () );
418
439
else
419
440
CSP_THROW ( ValueError, " column " << it.key () << " not found in schema" );
420
441
}
421
442
reader = std::make_shared<RecordBatchReader>( columns, schema );
422
443
reader -> initialize ();
423
444
424
445
CspTypePtr outType = std::make_shared<csp::CspStructType>( cls.value () );
425
- csp::adapters::utils::StructAdapterInfo key{ std::move (outType), std::move (field_map) };
446
+ csp::adapters::utils::StructAdapterInfo key{ std::move ( outType ), std::move ( field_map ) };
426
447
auto & struct_adapter = reader -> getStructAdapter ( key );
427
448
struct_adapter.addSubscriber ( [this ]( StructPtr * s )
428
449
{
@@ -444,10 +465,10 @@ DECLARE_CPPNODE( record_batches_to_struct )
444
465
PyObject* py_array = PyTuple_GET_ITEM ( py_tuple, 1 );
445
466
struct ArrowSchema * c_schema = reinterpret_cast <struct ArrowSchema *>( PyCapsule_GetPointer ( py_schema, " arrow_schema" ) );
446
467
struct ArrowArray * c_array = reinterpret_cast <struct ArrowArray *>( PyCapsule_GetPointer ( py_array, " arrow_array" ) );
447
- auto result = arrow::ImportRecordBatch (c_array, c_schema);
468
+ auto result = arrow::ImportRecordBatch ( c_array, c_schema );
448
469
if ( !result.ok () )
449
470
CSP_THROW ( ValueError, " Failed to load record batches through PyCapsule C Data interface: " << result.status ().ToString () );
450
- batches.emplace_back (result.ValueUnsafe ());
471
+ batches.emplace_back ( result.ValueUnsafe () );
451
472
}
452
473
std::vector<StructPtr> & out = unnamed_output ().reserveSpace <std::vector<StructPtr>>();
453
474
out.clear ();
@@ -460,6 +481,120 @@ DECLARE_CPPNODE( record_batches_to_struct )
460
481
461
482
EXPORT_CPPNODE ( record_batches_to_struct );
462
483
484
+ DECLARE_CPPNODE ( struct_to_record_batches )
485
+ {
486
+ SCALAR_INPUT ( DialectGenericType, schema_ptr );
487
+ SCALAR_INPUT ( StructMetaPtr, cls );
488
+ SCALAR_INPUT ( DictionaryPtr, properties );
489
+ SCALAR_INPUT ( int64_t , chunk_size );
490
+ TS_INPUT ( Generic, data );
491
+
492
+ TS_OUTPUT ( Generic );
493
+
494
+ using StructParquetOutputHandler = csp::adapters::parquet::StructParquetOutputHandler;
495
+ using ParquetWriter = csp::adapters::parquet::ParquetWriter;
496
+ class MyParquetWriter : public ParquetWriter
497
+ {
498
+ public:
499
+ MyParquetWriter ( int64_t chunk_size ): ParquetWriter(), m_chunkSize( chunk_size )
500
+ {
501
+ if ( m_chunkSize <= 0 )
502
+ {
503
+ CSP_THROW ( ValueError, " Chunk size should be >= 0" );
504
+ }
505
+ }
506
+ std::uint32_t getChunkSize () const override { return m_chunkSize; }
507
+ private:
508
+ int64_t m_chunkSize = 0 ;
509
+ };
510
+
511
+ std::shared_ptr<StructParquetOutputHandler> m_handler;
512
+ CspTypePtr m_cspType;
513
+ std::shared_ptr<MyParquetWriter> m_writer;
514
+ std::shared_ptr<arrow::Schema> m_schema;
515
+
516
+ INIT_CPPNODE ( struct_to_record_batches )
517
+ {
518
+ auto & input_def = tsinputDef ( " data" );
519
+ if ( input_def.type -> type () != CspType::Type::ARRAY )
520
+ CSP_THROW ( TypeError, " struct_to_record_batches expected ts array type, got " << input_def.type -> type () );
521
+
522
+ auto * aType = static_cast <const CspArrayType *>( input_def.type .get () );
523
+ CspTypePtr elemType = aType -> elemType ();
524
+ if ( elemType -> type () != CspType::Type::STRUCT )
525
+ CSP_THROW ( TypeError, " struct_to_record_batches expected ts array of structs type, got " << elemType -> type () );
526
+
527
+ auto & output_def = tsoutputDef ( " " );
528
+ if ( output_def.type -> type () != CspType::Type::ARRAY )
529
+ CSP_THROW ( TypeError, " struct_to_record_batches expected ts array type, got " << output_def.type -> type () );
530
+ }
531
+
532
+ START ()
533
+ {
534
+ // Create Adapters for Schema
535
+ auto field_map = properties.value () -> get<DictionaryPtr>( " field_map" );
536
+ m_writer = std::make_shared<MyParquetWriter>( chunk_size.value () );
537
+ m_cspType = std::make_shared<csp::CspStructType>( cls.value () );
538
+ m_handler = std::make_shared<StructParquetOutputHandler>( engine (), *m_writer, m_cspType, field_map );
539
+ std::vector<std::shared_ptr<arrow::Field>> arrowFields;
540
+ for ( unsigned i = 0 ; i < m_handler -> getNumColumns (); i++ )
541
+ {
542
+ arrowFields.push_back ( arrow::field ( m_handler -> getColumnArrayBuilder ( i ) -> getColumnName (),
543
+ m_handler -> getColumnArrayBuilder ( i ) -> getDataType () ) );
544
+ }
545
+ m_schema = arrow::schema ( arrowFields );
546
+ }
547
+
548
+ DialectGenericType getData ( std::shared_ptr<StructParquetOutputHandler> handler, int num_rows )
549
+ {
550
+ std::vector<std::shared_ptr<arrow::Array>> columns;
551
+ columns.reserve ( handler -> getNumColumns () );
552
+ for ( unsigned i = 0 ; i < handler -> getNumColumns (); i++ )
553
+ {
554
+ columns.push_back ( handler -> getColumnArrayBuilder ( i ) -> buildArray () );
555
+ }
556
+ auto rb_ptr = arrow::RecordBatch::Make ( m_schema, num_rows, columns );
557
+ const arrow::RecordBatch& rb = *rb_ptr;
558
+ struct ArrowSchema * rb_schema = ( struct ArrowSchema * )malloc ( sizeof ( struct ArrowSchema ) );
559
+ struct ArrowArray * rb_array = ( struct ArrowArray * )malloc ( sizeof ( struct ArrowArray ) );
560
+ arrow::Status st = arrow::ExportRecordBatch ( rb, rb_array, rb_schema );
561
+ auto py_schema = csp::python::PyObjectPtr::own ( PyCapsule_New ( rb_schema, " arrow_schema" , ReleaseArrowSchemaPyCapsule ) );
562
+ auto py_array = csp::python::PyObjectPtr::own ( PyCapsule_New ( rb_array, " arrow_array" , ReleaseArrowArrayPyCapsule ) );
563
+ auto py_tuple = csp::python::PyObjectPtr::own ( PyTuple_Pack ( 2 , py_schema.get (), py_array.get () ) );
564
+ return csp::python::fromPython<DialectGenericType>( py_tuple.get () );
565
+ }
566
+
567
+ INVOKE ()
568
+ {
569
+ if ( csp.ticked ( data ) )
570
+ {
571
+ std::vector<DialectGenericType> & out = unnamed_output ().reserveSpace <std::vector<DialectGenericType>>();
572
+ out.clear ();
573
+ auto & structs = data.lastValue <std::vector<StructPtr>>();
574
+ uint32_t cur_chunk_size = 0 ;
575
+ for ( auto & st: structs )
576
+ {
577
+ m_handler -> writeValueFromArgs ( st );
578
+ for ( unsigned i = 0 ; i < m_handler -> getNumColumns (); i++ )
579
+ {
580
+ m_handler -> getColumnArrayBuilder ( i ) -> handleRowFinished ();
581
+ }
582
+ if ( ++cur_chunk_size >= m_writer -> getChunkSize () )
583
+ {
584
+ out.emplace_back ( getData ( m_handler, cur_chunk_size ) );
585
+ cur_chunk_size = 0 ;
586
+ }
587
+ }
588
+ if ( cur_chunk_size > 0 )
589
+ {
590
+ out.emplace_back ( getData ( m_handler, cur_chunk_size ) );
591
+ }
592
+ }
593
+ }
594
+ };
595
+
596
+ EXPORT_CPPNODE ( struct_to_record_batches );
597
+
463
598
}
464
599
465
600
// Base nodes
@@ -486,6 +621,7 @@ REGISTER_CPPNODE( csp::cppnodes, struct_collectts );
486
621
487
622
REGISTER_CPPNODE ( csp::cppnodes, exprtk_impl );
488
623
REGISTER_CPPNODE ( csp::cppnodes, record_batches_to_struct );
624
+ REGISTER_CPPNODE ( csp::cppnodes, struct_to_record_batches );
489
625
490
626
static PyModuleDef _cspbaselibimpl_module = {
491
627
PyModuleDef_HEAD_INIT,
0 commit comments