Closed
Description
Feature Type
-
Adding new functionality to pandas
-
Changing existing functionality in pandas
-
Removing existing functionality in pandas
Problem Description
I want to read a parquet file but have control over how the pyarrow Table is converted to a pandas DataFrame, by specifying the to_pandas_kwargs
argument in the call to pd.read_parquet()
(which would be forwarded to pyarrow's Table.to_pandas()).
import pyarrow as pa
import pyarrow.parquet as pq
import datetime
# write
arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us"))
table = pa.table([arr], names=["timestamp"])
pq.write_table(table, "test.parquet")
# read
import pandas as pd
pd.read_parquet("test.parquet")
That raises with
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
Cell In [43], line 10
7 pq.write_table(table, "test.parquet")
9 import pandas as pd
---> 10 pd.read_parquet("test.parquet")
File /srv/conda/envs/notebook/lib/python3.10/site-packages/pandas/io/parquet.py:501, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, **kwargs)
454 """
455 Load a parquet object from the file path, returning a DataFrame.
456
(...)
497 DataFrame
498 """
499 impl = get_engine(engine)
--> 501 return impl.read(
502 path,
503 columns=columns,
504 storage_options=storage_options,
505 use_nullable_dtypes=use_nullable_dtypes,
506 **kwargs,
507 )
File /srv/conda/envs/notebook/lib/python3.10/site-packages/pandas/io/parquet.py:249, in PyArrowImpl.read(self, path, columns, use_nullable_dtypes, storage_options, **kwargs)
242 path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
243 path,
244 kwargs.pop("filesystem", None),
245 storage_options=storage_options,
246 mode="rb",
247 )
248 try:
--> 249 result = self.api.parquet.read_table(
250 path_or_handle, columns=columns, **kwargs
251 ).to_pandas(**to_pandas_kwargs)
252 if manager == "array":
253 result = result._as_manager("array", copy=False)
File /srv/conda/envs/notebook/lib/python3.10/site-packages/pyarrow/array.pxi:823, in pyarrow.lib._PandasConvertible.to_pandas()
File /srv/conda/envs/notebook/lib/python3.10/site-packages/pyarrow/table.pxi:3913, in pyarrow.lib.Table._to_pandas()
File /srv/conda/envs/notebook/lib/python3.10/site-packages/pyarrow/pandas_compat.py:818, in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
816 _check_data_column_metadata_consistency(all_columns)
817 columns = _deserialize_column_index(table, all_columns, column_indexes)
--> 818 blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
820 axes = [columns, index]
821 return BlockManager(blocks, axes)
File /srv/conda/envs/notebook/lib/python3.10/site-packages/pyarrow/pandas_compat.py:1168, in _table_to_blocks(options, block_table, categories, extension_columns)
1163 def _table_to_blocks(options, block_table, categories, extension_columns):
1164 # Part of table_to_blockmanager
1165
1166 # Convert an arrow table to Block from the internal pandas API
1167 columns = block_table.column_names
-> 1168 result = pa.lib.table_to_blocks(options, block_table, categories,
1169 list(extension_columns.keys()))
1170 return [_reconstruct_block(item, columns, extension_columns)
1171 for item in result]
File /srv/conda/envs/notebook/lib/python3.10/site-packages/pyarrow/table.pxi:2602, in pyarrow.lib.table_to_blocks()
File /srv/conda/envs/notebook/lib/python3.10/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()
ArrowInvalid: Casting from timestamp[us] to timestamp[ns] would result in out of bounds timestamp: -11676096000000000
The solution, in pyarrow, is to pass timestamp_as_object=True
in the call to .to_pandas().
Feature Description
Add a new parameter to read_parquet
(technically just the arrow engine, but adding it here for docs)
pd.read_parquet(
path: 'FilePath | ReadBuffer[bytes]',
engine: 'str' = 'auto',
columns: 'list[str] | None' = None,
storage_options: 'StorageOptions' = None,
use_nullable_dtypes: 'bool' = False,
to_pandas_kwargs: dict[str, Any] | None = None,
**kwargs,
) -> 'DataFrame'
"""
to_pandas_kwargs:
Additional keyword arguments passed to :meth:`pyarrow.Table.to_pandas` to control
how the pyarrow Table is converted to a pandas DataFrame. By default,
the `use_nullable_dtypes` option controls whether the `types_mapper` argument
is set.
"""
Alternative Solutions
Just use pyarrow :)
Additional Context
No response