5
5
#include < exprtk.hpp>
6
6
#include < numpy/ndarrayobject.h>
7
7
8
+ #include < arrow/type.h>
9
+ #include < arrow/table.h>
10
+ #include < arrow/c/abi.h>
11
+ #include < arrow/c/bridge.h>
12
+
13
+ #include < csp/adapters/parquet/ParquetReader.h>
14
+ #include < csp/adapters/utils/StructAdapterInfo.h>
15
+ #include < csp/adapters/utils/ValueDispatcher.h>
16
+
8
17
static void * init_nparray ()
9
18
{
10
19
csp::python::AcquireGIL gil;
@@ -325,6 +334,136 @@ DECLARE_CPPNODE( exprtk_impl )
325
334
326
335
EXPORT_CPPNODE ( exprtk_impl );
327
336
337
+ DECLARE_CPPNODE ( record_batches_to_struct )
338
+ {
339
+ using InMemoryTableParquetReader = csp::adapters::parquet::InMemoryTableParquetReader;
340
+ class MyTableReader : public InMemoryTableParquetReader
341
+ {
342
+ public:
343
+ MyTableReader ( std::vector<std::string> columns, std::shared_ptr<arrow::Schema> schema ):
344
+ InMemoryTableParquetReader ( nullptr , columns, false , {}, false )
345
+ {
346
+ m_schema = schema;
347
+ }
348
+ std::string getCurFileOrTableName () const override { return " IN_RECORD_BATCH" ; }
349
+ void initialize () { setColumnAdaptersFromCurrentTable (); }
350
+ void parseBatches ( std::vector<std::shared_ptr<arrow::RecordBatch>> record_batches )
351
+ {
352
+ // TODO: Check if the schema has not changed
353
+ auto table_result = arrow::Table::FromRecordBatches (record_batches);
354
+ if ( !table_result.ok () )
355
+ CSP_THROW ( NotImplemented, " Unable to make table from record batches" );
356
+
357
+ setTable ( table_result.ValueUnsafe () );
358
+
359
+ if ( !readNextRowGroup () )
360
+ CSP_THROW ( NotImplemented, " Unable to read row group from table" );
361
+
362
+ while ( readNextRow () )
363
+ {
364
+ for ( auto & adapter: getStructAdapters () )
365
+ {
366
+ adapter -> dispatchValue ( nullptr );
367
+ }
368
+ }
369
+ }
370
+ void stop ()
371
+ {
372
+ InMemoryTableParquetReader::clear ();
373
+ }
374
+ protected:
375
+ bool openNextFile () override { return false ; }
376
+ void clear () override { setTable ( nullptr ); }
377
+ };
378
+
379
+ SCALAR_INPUT ( DialectGenericType, schema_ptr );
380
+ SCALAR_INPUT ( StructMetaPtr, cls );
381
+ SCALAR_INPUT ( DictionaryPtr, properties );
382
+ TS_INPUT ( Generic, data );
383
+
384
+ TS_OUTPUT ( Generic );
385
+
386
+ std::shared_ptr<MyTableReader> reader;
387
+ CspTypePtr outType;
388
+ std::vector<StructPtr>* m_structsVecPtr;
389
+
390
+ using StructAdapterInfo = csp::adapters::utils::StructAdapterInfo;
391
+ using ValueDispatcher = csp::adapters::utils::ValueDispatcher<StructPtr &>;
392
+
393
+ INIT_CPPNODE ( record_batches_to_struct )
394
+ {
395
+ auto & input_def = tsinputDef ( " data" );
396
+ if ( input_def.type -> type () != CspType::Type::ARRAY )
397
+ CSP_THROW ( TypeError, " record_batches_to_struct expected ts array type, got " << input_def.type -> type () );
398
+
399
+ auto * aType = static_cast <const CspArrayType *>( input_def.type .get () );
400
+ CspTypePtr elemType = aType -> elemType ();
401
+ if ( elemType -> type () != CspType::Type::DIALECT_GENERIC )
402
+ CSP_THROW ( TypeError, " record_batches_to_struct expected ts array of DIALECT_GENERIC type, got " << elemType -> type () );
403
+
404
+ auto & output_def = tsoutputDef ( " " );
405
+ if ( output_def.type -> type () != CspType::Type::ARRAY )
406
+ CSP_THROW ( NotImplemented, " record_batches_to_struct expected ts array type, got " << output_def.type -> type () );
407
+ }
408
+
409
+ START ()
410
+ {
411
+ // Create Adapters for Schema
412
+ PyObject* capsule = csp::python::toPythonBorrowed (schema_ptr);
413
+ struct ArrowSchema * c_schema = reinterpret_cast <struct ArrowSchema *>( PyCapsule_GetPointer (capsule, " arrow_schema" ) );
414
+ auto result = arrow::ImportSchema (c_schema);
415
+ if ( !result.ok () )
416
+ CSP_THROW ( NotImplemented, " Unable to import schema" );
417
+ std::shared_ptr<arrow::Schema> schema = result.ValueUnsafe ();
418
+ std::vector<std::string> columns;
419
+ auto field_map = properties.value () -> get<DictionaryPtr>( " field_map" );
420
+ for ( auto it = field_map -> begin (); it != field_map -> end (); ++it )
421
+ {
422
+ // TODO: Check if the column exists in the table
423
+ columns.push_back (it.key ());
424
+ }
425
+ reader = std::make_shared<MyTableReader>( columns, schema );
426
+ reader -> initialize ();
427
+
428
+ outType = std::make_shared<csp::CspStructType>( cls.value () );
429
+ StructAdapterInfo key{ outType, field_map };
430
+ auto & struct_adapter = reader -> getStructAdapter ( key );
431
+ struct_adapter.addSubscriber ( [this ]( StructPtr * s )
432
+ {
433
+ if ( s ) this -> m_structsVecPtr -> push_back ( *s );
434
+ else CSP_THROW ( NotImplemented, " StructPtr was null" );
435
+ }, {} );
436
+ }
437
+
438
+ INVOKE ()
439
+ {
440
+ if ( csp.ticked ( data ) )
441
+ {
442
+ auto & py_batches = data.lastValue <std::vector<DialectGenericType>>();
443
+ std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
444
+ for ( auto & py_batch: py_batches )
445
+ {
446
+ PyObject* py_tuple = csp::python::toPythonBorrowed ( py_batch );
447
+ PyObject* py_schema = PyTuple_GET_ITEM ( py_tuple, 0 );
448
+ PyObject* py_array = PyTuple_GET_ITEM ( py_tuple, 1 );
449
+ struct ArrowSchema * c_schema = reinterpret_cast <struct ArrowSchema *>( PyCapsule_GetPointer ( py_schema, " arrow_schema" ) );
450
+ struct ArrowArray * c_array = reinterpret_cast <struct ArrowArray *>( PyCapsule_GetPointer ( py_array, " arrow_array" ) );
451
+ auto result = arrow::ImportRecordBatch (c_array, c_schema);
452
+ if ( !result.ok () )
453
+ CSP_THROW ( NotImplemented, " Unable to import record batch from c interface" );
454
+ batches.emplace_back (result.ValueUnsafe ());
455
+ }
456
+ std::vector<StructPtr> & out = unnamed_output ().reserveSpace <std::vector<StructPtr>>();
457
+ out.clear ();
458
+ m_structsVecPtr = &out;
459
+ reader -> parseBatches ( batches );
460
+ m_structsVecPtr = nullptr ;
461
+ }
462
+ }
463
+ };
464
+
465
+ EXPORT_CPPNODE ( record_batches_to_struct );
466
+
328
467
}
329
468
330
469
// Base nodes
@@ -350,6 +489,7 @@ REGISTER_CPPNODE( csp::cppnodes, struct_fromts );
350
489
REGISTER_CPPNODE ( csp::cppnodes, struct_collectts );
351
490
352
491
REGISTER_CPPNODE ( csp::cppnodes, exprtk_impl );
492
+ REGISTER_CPPNODE ( csp::cppnodes, record_batches_to_struct );
353
493
354
494
static PyModuleDef _cspbaselibimpl_module = {
355
495
PyModuleDef_HEAD_INIT,
0 commit comments