@@ -3253,6 +3253,18 @@ def __init__(self, tbl: Table) -> None:
3253
3253
except ModuleNotFoundError as e :
3254
3254
raise ModuleNotFoundError ("For metadata operations PyArrow needs to be installed" ) from e
3255
3255
3256
def _get_snapshot(self, snapshot_id: Optional[int] = None) -> Snapshot:
    """Resolve a snapshot for metadata-table queries.

    Looks up the snapshot with *snapshot_id* when one is given; otherwise
    falls back to the table's current snapshot.

    Args:
        snapshot_id: Explicit snapshot ID to resolve, or ``None`` for the
            current snapshot.

    Returns:
        The resolved ``Snapshot``.

    Raises:
        ValueError: If the given ID is unknown, or no ID was given and the
            table has no current snapshot.
    """
    metadata = self.tbl.metadata
    if snapshot_id is not None:
        # Explicit ID requested: it must exist, otherwise fail loudly.
        found = metadata.snapshot_by_id(snapshot_id)
        if found is None:
            raise ValueError(f"Cannot find snapshot with ID {snapshot_id}")
        return found
    # No ID given: default to the table's current snapshot, if any.
    current = metadata.current_snapshot()
    if current is None:
        raise ValueError("Cannot get a snapshot as the table does not have any.")
    return current
3256
3268
def snapshots (self ) -> "pa.Table" :
3257
3269
import pyarrow as pa
3258
3270
@@ -3287,7 +3299,7 @@ def snapshots(self) -> "pa.Table":
3287
3299
schema = snapshots_schema ,
3288
3300
)
3289
3301
3290
- def entries (self ) -> "pa.Table" :
3302
+ def entries (self , snapshot_id : Optional [ int ] = None ) -> "pa.Table" :
3291
3303
import pyarrow as pa
3292
3304
3293
3305
from pyiceberg .io .pyarrow import schema_to_pyarrow
@@ -3346,64 +3358,64 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
3346
3358
])
3347
3359
3348
3360
entries = []
3349
- if snapshot := self .tbl .metadata .current_snapshot ():
3350
- for manifest in snapshot .manifests (self .tbl .io ):
3351
- for entry in manifest .fetch_manifest_entry (io = self .tbl .io ):
3352
- column_sizes = entry .data_file .column_sizes or {}
3353
- value_counts = entry .data_file .value_counts or {}
3354
- null_value_counts = entry .data_file .null_value_counts or {}
3355
- nan_value_counts = entry .data_file .nan_value_counts or {}
3356
- lower_bounds = entry .data_file .lower_bounds or {}
3357
- upper_bounds = entry .data_file .upper_bounds or {}
3358
- readable_metrics = {
3359
- schema .find_column_name (field .field_id ): {
3360
- "column_size" : column_sizes .get (field .field_id ),
3361
- "value_count" : value_counts .get (field .field_id ),
3362
- "null_value_count" : null_value_counts .get (field .field_id ),
3363
- "nan_value_count" : nan_value_counts .get (field .field_id ),
3364
- # Makes them readable
3365
- "lower_bound" : from_bytes (field .field_type , lower_bound )
3366
- if (lower_bound := lower_bounds .get (field .field_id ))
3367
- else None ,
3368
- "upper_bound" : from_bytes (field .field_type , upper_bound )
3369
- if (upper_bound := upper_bounds .get (field .field_id ))
3370
- else None ,
3371
- }
3372
- for field in self .tbl .metadata .schema ().fields
3373
- }
3374
-
3375
- partition = entry .data_file .partition
3376
- partition_record_dict = {
3377
- field .name : partition [pos ]
3378
- for pos , field in enumerate (self .tbl .metadata .specs ()[manifest .partition_spec_id ].fields )
3361
+ snapshot = self ._get_snapshot (snapshot_id )
3362
+ for manifest in snapshot .manifests (self .tbl .io ):
3363
+ for entry in manifest .fetch_manifest_entry (io = self .tbl .io ):
3364
+ column_sizes = entry .data_file .column_sizes or {}
3365
+ value_counts = entry .data_file .value_counts or {}
3366
+ null_value_counts = entry .data_file .null_value_counts or {}
3367
+ nan_value_counts = entry .data_file .nan_value_counts or {}
3368
+ lower_bounds = entry .data_file .lower_bounds or {}
3369
+ upper_bounds = entry .data_file .upper_bounds or {}
3370
+ readable_metrics = {
3371
+ schema .find_column_name (field .field_id ): {
3372
+ "column_size" : column_sizes .get (field .field_id ),
3373
+ "value_count" : value_counts .get (field .field_id ),
3374
+ "null_value_count" : null_value_counts .get (field .field_id ),
3375
+ "nan_value_count" : nan_value_counts .get (field .field_id ),
3376
+ # Makes them readable
3377
+ "lower_bound" : from_bytes (field .field_type , lower_bound )
3378
+ if (lower_bound := lower_bounds .get (field .field_id ))
3379
+ else None ,
3380
+ "upper_bound" : from_bytes (field .field_type , upper_bound )
3381
+ if (upper_bound := upper_bounds .get (field .field_id ))
3382
+ else None ,
3379
3383
}
3380
-
3381
- entries .append ({
3382
- 'status' : entry .status .value ,
3383
- 'snapshot_id' : entry .snapshot_id ,
3384
- 'sequence_number' : entry .data_sequence_number ,
3385
- 'file_sequence_number' : entry .file_sequence_number ,
3386
- 'data_file' : {
3387
- "content" : entry .data_file .content ,
3388
- "file_path" : entry .data_file .file_path ,
3389
- "file_format" : entry .data_file .file_format ,
3390
- "partition" : partition_record_dict ,
3391
- "record_count" : entry .data_file .record_count ,
3392
- "file_size_in_bytes" : entry .data_file .file_size_in_bytes ,
3393
- "column_sizes" : dict (entry .data_file .column_sizes ),
3394
- "value_counts" : dict (entry .data_file .value_counts ),
3395
- "null_value_counts" : dict (entry .data_file .null_value_counts ),
3396
- "nan_value_counts" : entry .data_file .nan_value_counts ,
3397
- "lower_bounds" : entry .data_file .lower_bounds ,
3398
- "upper_bounds" : entry .data_file .upper_bounds ,
3399
- "key_metadata" : entry .data_file .key_metadata ,
3400
- "split_offsets" : entry .data_file .split_offsets ,
3401
- "equality_ids" : entry .data_file .equality_ids ,
3402
- "sort_order_id" : entry .data_file .sort_order_id ,
3403
- "spec_id" : entry .data_file .spec_id ,
3404
- },
3405
- 'readable_metrics' : readable_metrics ,
3406
- })
3384
+ for field in self .tbl .metadata .schema ().fields
3385
+ }
3386
+
3387
+ partition = entry .data_file .partition
3388
+ partition_record_dict = {
3389
+ field .name : partition [pos ]
3390
+ for pos , field in enumerate (self .tbl .metadata .specs ()[manifest .partition_spec_id ].fields )
3391
+ }
3392
+
3393
+ entries .append ({
3394
+ 'status' : entry .status .value ,
3395
+ 'snapshot_id' : entry .snapshot_id ,
3396
+ 'sequence_number' : entry .data_sequence_number ,
3397
+ 'file_sequence_number' : entry .file_sequence_number ,
3398
+ 'data_file' : {
3399
+ "content" : entry .data_file .content ,
3400
+ "file_path" : entry .data_file .file_path ,
3401
+ "file_format" : entry .data_file .file_format ,
3402
+ "partition" : partition_record_dict ,
3403
+ "record_count" : entry .data_file .record_count ,
3404
+ "file_size_in_bytes" : entry .data_file .file_size_in_bytes ,
3405
+ "column_sizes" : dict (entry .data_file .column_sizes ),
3406
+ "value_counts" : dict (entry .data_file .value_counts ),
3407
+ "null_value_counts" : dict (entry .data_file .null_value_counts ),
3408
+ "nan_value_counts" : entry .data_file .nan_value_counts ,
3409
+ "lower_bounds" : entry .data_file .lower_bounds ,
3410
+ "upper_bounds" : entry .data_file .upper_bounds ,
3411
+ "key_metadata" : entry .data_file .key_metadata ,
3412
+ "split_offsets" : entry .data_file .split_offsets ,
3413
+ "equality_ids" : entry .data_file .equality_ids ,
3414
+ "sort_order_id" : entry .data_file .sort_order_id ,
3415
+ "spec_id" : entry .data_file .spec_id ,
3416
+ },
3417
+ 'readable_metrics' : readable_metrics ,
3418
+ })
3407
3419
3408
3420
return pa .Table .from_pylist (
3409
3421
entries ,
0 commit comments