Extract Delta Lake deletion vectors #661
@@ -0,0 +1,75 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.xtable.model.storage;

import java.util.Iterator;
import java.util.function.Supplier;

import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NonNull;
import lombok.ToString;
import lombok.experimental.Accessors;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;

@Accessors(fluent = true)
@SuperBuilder(toBuilder = true)
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@Getter
@ToString(callSuper = true)
@EqualsAndHashCode(callSuper = true)
public class InternalDeletionVector extends InternalFile {
  // path (absolute, with scheme) of the data file to which this deletion vector belongs
  @NonNull String dataFilePath;

  // super.getFileSizeBytes() is the size of the deletion vector file
  // super.getPhysicalPath() is the absolute path (with scheme) of the deletion vector file
  // super.getRecordCount() is the count of records in the deletion vector file

  // offset at which the deletion vector data starts in the deletion vector file
  int offset;

  /**
   * Binary representation of the deletion vector. The consumer can use {@link
   * #ordinalsIterator()} to extract the ordinals represented in the binary format.
   */
  byte[] binaryRepresentation;

  /**
   * Supplier for an iterator that returns the ordinals of records deleted by this deletion vector
   * in the linked data file, identified by {@link #dataFilePath}.
   *
   * <p>The {@link InternalDeletionVector} instance does not guarantee that a new or distinct
   * result will be returned each time the supplier is invoked. However, the supplier is expected
   * to return a new iterator for each call.
   */
  @Getter(AccessLevel.NONE)
  Supplier<Iterator<Long>> ordinalsSupplier;

  /**
   * @return an iterator over the ordinals of records deleted by this deletion vector in the
   *     linked data file. There is no guarantee that a new or distinct iterator will be returned
   *     on each invocation.
   */
  public Iterator<Long> ordinalsIterator() {
    return ordinalsSupplier.get();
  }
}
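For illustration, here is a minimal, hypothetical sketch of how a producer could assemble an `InternalDeletionVector` and how a consumer reads it back. It assumes `InternalFile` is also a `@SuperBuilder` class exposing `physicalPath`, `fileSizeBytes`, and `recordCount` builder methods (suggested by the `super.*` notes in the class above); the paths and values are made up.

```java
import java.util.Arrays;

class DeletionVectorSketch {
  public static void main(String[] args) {
    // Hypothetical values; builder method names for the inherited fields are assumed.
    InternalDeletionVector deletionVector =
        InternalDeletionVector.builder()
            .dataFilePath("s3://bucket/table/part-00000.parquet") // data file the vector applies to
            .physicalPath("s3://bucket/table/deletion_vector_1234.bin") // the deletion vector file
            .fileSizeBytes(372L)
            .recordCount(3L)
            .offset(1)
            // the supplier returns a fresh iterator on every call, keeping parsing lazy
            .ordinalsSupplier(() -> Arrays.asList(4L, 17L, 29L).iterator())
            .build();

    // Consumers read deleted-row ordinals without touching the binary representation.
    deletionVector.ordinalsIterator().forEachRemaining(System.out::println);
  }
}
```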
@@ -22,11 +22,11 @@
 import java.time.Instant;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;

 import lombok.Builder;
 import lombok.extern.log4j.Log4j2;
@@ -53,6 +53,8 @@
 import org.apache.xtable.model.schema.InternalSchema;
 import org.apache.xtable.model.storage.FileFormat;
 import org.apache.xtable.model.storage.InternalDataFile;
+import org.apache.xtable.model.storage.InternalDeletionVector;
+import org.apache.xtable.model.storage.InternalFile;
 import org.apache.xtable.model.storage.InternalFilesDiff;
 import org.apache.xtable.model.storage.PartitionFileGroup;
 import org.apache.xtable.spi.extractor.ConversionSource;
@@ -113,8 +115,8 @@ public TableChange getTableChangeForCommit(Long versionNumber) {
     // All 3 of the following data structures use the data file's absolute path as the key
     Map<String, InternalDataFile> addedFiles = new HashMap<>();
     Map<String, InternalDataFile> removedFiles = new HashMap<>();
-    // Set of data file paths for which deletion vectors exist.
-    Set<String> deletionVectors = new HashSet<>();
+    // Map of data file paths for which deletion vectors exist.
+    Map<String, InternalDeletionVector> deletionVectors = new HashMap<>();

     for (Action action : actionsForVersion) {
       if (action instanceof AddFile) {
@@ -129,10 +131,10 @@ public TableChange getTableChangeForCommit(Long versionNumber) {
               DeltaPartitionExtractor.getInstance(),
               DeltaStatsExtractor.getInstance());
       addedFiles.put(dataFile.getPhysicalPath(), dataFile);
-      String deleteVectorPath =
-          actionsConverter.extractDeletionVectorFile(snapshotAtVersion, (AddFile) action);
-      if (deleteVectorPath != null) {
-        deletionVectors.add(deleteVectorPath);
+      InternalDeletionVector deletionVector =
+          actionsConverter.extractDeletionVector(snapshotAtVersion, (AddFile) action);
+      if (deletionVector != null) {
+        deletionVectors.put(deletionVector.dataFilePath(), deletionVector);

Review comment: deletionVector.dataFilePath() points to the path of the associated Parquet data file. We should use deletionVector.getPhysicalPath() instead. Thoughts? @ashvina

Review comment: Thanks for the review @piyushdubey

Review comment: Makes sense. I see on line 168 you are concatenating the deletion vectors with the internal files, so both the deletion vector and the data file are added, without either being skipped. Thanks for the clarification.

       }
     } else if (action instanceof RemoveFile) {
       InternalDataFile dataFile =
@@ -151,7 +153,7 @@ public TableChange getTableChangeForCommit(Long versionNumber) {
     // entry which is replaced by a new entry, an AddFile with deletion vector information. Since
     // the same data file is removed and added, we need to remove it from the added and removed
     // file maps, which are used to track the actual added and removed data files.
-    for (String deletionVector : deletionVectors) {
+    for (String deletionVector : deletionVectors.keySet()) {

Review comment: nitpick: the name …

       // validate that a Remove action is also added for the data file
       if (removedFiles.containsKey(deletionVector)) {
         addedFiles.remove(deletionVector);
@@ -163,11 +165,15 @@ public TableChange getTableChangeForCommit(Long versionNumber) {
       }
     }

+    List<InternalFile> allAddedFiles =
+        Stream.concat(addedFiles.values().stream(), deletionVectors.values().stream())
+            .collect(Collectors.toList());
     InternalFilesDiff internalFilesDiff =
         InternalFilesDiff.builder()
-            .filesAdded(addedFiles.values())
+            .filesAdded(allAddedFiles)
             .filesRemoved(removedFiles.values())
             .build();

     return TableChange.builder()
         .tableAsOfChange(tableAtVersion)
         .filesDiff(internalFilesDiff)
@@ -19,11 +19,16 @@
 package org.apache.xtable.delta;

 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;

 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.time.Instant;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;

 import org.apache.hadoop.conf.Configuration;
@@ -38,6 +43,7 @@
 import org.apache.spark.sql.delta.DeltaLog;
+import org.apache.spark.sql.delta.actions.AddFile;
 import org.apache.spark.sql.delta.actions.DeletionVectorDescriptor;

 import scala.Option;
@@ -49,13 +55,15 @@
 import org.apache.xtable.model.InstantsForIncrementalSync;
 import org.apache.xtable.model.InternalSnapshot;
 import org.apache.xtable.model.TableChange;
+import org.apache.xtable.model.storage.InternalDeletionVector;
 import org.apache.xtable.model.storage.TableFormat;

 public class ITDeltaDeleteVectorConvert {
   @TempDir private static Path tempDir;
   private static SparkSession sparkSession;

   private DeltaConversionSourceProvider conversionSourceProvider;
+  private TestSparkDeltaTable testSparkDeltaTable;

   @BeforeAll
   public static void setupOnce() {
@@ -91,11 +99,24 @@ void setUp() {
     conversionSourceProvider.init(hadoopConf);
   }

+  private static class TableState {
+    Map<String, AddFile> activeFiles;
+    List<Row> rowsToDelete;

Review comment: This list looks like it is unused, is that intentional?

+    TableState(Map<String, AddFile> activeFiles) {
+      this(activeFiles, Collections.emptyList());
+    }
+
+    TableState(Map<String, AddFile> activeFiles, List<Row> rowsToDelete) {
+      this.activeFiles = activeFiles;
+      this.rowsToDelete = rowsToDelete;
+    }
+  }

   @Test
   public void testInsertsUpsertsAndDeletes() {
     String tableName = GenericTable.getTableName();
-    TestSparkDeltaTable testSparkDeltaTable =
-        new TestSparkDeltaTable(tableName, tempDir, sparkSession, null, false);
+    testSparkDeltaTable = new TestSparkDeltaTable(tableName, tempDir, sparkSession, null, false);

     // enable deletion vectors for the test table
     testSparkDeltaTable
@@ -105,25 +126,30 @@ public void testInsertsUpsertsAndDeletes() {
             + tableName
             + " SET TBLPROPERTIES ('delta.enableDeletionVectors' = true)");

-    List<List<String>> allActiveFiles = new ArrayList<>();
+    List<TableState> testTableStates = new ArrayList<>();
     List<TableChange> allTableChanges = new ArrayList<>();
     List<Row> rows = testSparkDeltaTable.insertRows(50);
     Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp();
-    allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
+    Map<String, AddFile> tableFiles = collectActiveFilesAfterCommit(testSparkDeltaTable);
+    testTableStates.add(new TableState(tableFiles, Collections.emptyList()));

     List<Row> rows1 = testSparkDeltaTable.insertRows(50);
-    allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
+    tableFiles = collectActiveFilesAfterCommit(testSparkDeltaTable);
+    testTableStates.add(new TableState(tableFiles));
+    validateDeletedRecordCount(testSparkDeltaTable.getDeltaLog(), 0, 0);
     assertEquals(100L, testSparkDeltaTable.getNumRows());
-    validateDeletedRecordCount(testSparkDeltaTable.getDeltaLog(), allActiveFiles.size() + 1, 0, 0);

     // upsert does not create delete vectors
     testSparkDeltaTable.upsertRows(rows.subList(0, 20));
-    allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
+    tableFiles = collectActiveFilesAfterCommit(testSparkDeltaTable);
+    testTableStates.add(new TableState(tableFiles));
+    validateDeletedRecordCount(testSparkDeltaTable.getDeltaLog(), 0, 0);
     assertEquals(100L, testSparkDeltaTable.getNumRows());
-    validateDeletedRecordCount(testSparkDeltaTable.getDeltaLog(), allActiveFiles.size() + 1, 0, 0);

     testSparkDeltaTable.insertRows(50);
-    allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
+    tableFiles = collectActiveFilesAfterCommit(testSparkDeltaTable);
+    testTableStates.add(new TableState(tableFiles));
+    validateDeletedRecordCount(testSparkDeltaTable.getDeltaLog(), 0, 0);
     assertEquals(150L, testSparkDeltaTable.getNumRows());

     // delete a few rows with gaps in ids
@@ -133,40 +159,48 @@ public void testInsertsUpsertsAndDeletes() {
             .collect(Collectors.toList());
     rowsToDelete.addAll(rows.subList(35, 45));
     testSparkDeltaTable.deleteRows(rowsToDelete);
-    allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
+    tableFiles = collectActiveFilesAfterCommit(testSparkDeltaTable);
+    testTableStates.add(new TableState(tableFiles, rowsToDelete));
+    validateDeletedRecordCount(testSparkDeltaTable.getDeltaLog(), 2, 15);
     assertEquals(135L, testSparkDeltaTable.getNumRows());
-    validateDeletedRecordCount(testSparkDeltaTable.getDeltaLog(), allActiveFiles.size() + 1, 2, 15);

     testSparkDeltaTable.insertRows(50);
-    allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
+    tableFiles = collectActiveFilesAfterCommit(testSparkDeltaTable);
+    testTableStates.add(new TableState(tableFiles));
+    validateDeletedRecordCount(testSparkDeltaTable.getDeltaLog(), 2, 15);
     assertEquals(185L, testSparkDeltaTable.getNumRows());

     // delete a few rows from a file which already has a deletion vector; this should generate a
     // merged deletion vector file. Some rows were already deleted in the previous delete step.
     // This deletion step intentionally deletes the same rows again to test the merge.
     rowsToDelete = rows1.subList(5, 15);
     testSparkDeltaTable.deleteRows(rowsToDelete);
-    allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
+    tableFiles = collectActiveFilesAfterCommit(testSparkDeltaTable);
+    testTableStates.add(new TableState(tableFiles, rowsToDelete));
+    validateDeletedRecordCount(testSparkDeltaTable.getDeltaLog(), 2, 22);
     assertEquals(178L, testSparkDeltaTable.getNumRows());
-    validateDeletedRecordCount(testSparkDeltaTable.getDeltaLog(), allActiveFiles.size() + 1, 2, 22);

     testSparkDeltaTable.insertRows(50);
-    allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
+    tableFiles = collectActiveFilesAfterCommit(testSparkDeltaTable);
+    testTableStates.add(new TableState(tableFiles));
+    validateDeletedRecordCount(testSparkDeltaTable.getDeltaLog(), 2, 22);
     assertEquals(228L, testSparkDeltaTable.getNumRows());

+    String tableBasePath = testSparkDeltaTable.getBasePath();
     SourceTable tableConfig =
         SourceTable.builder()
             .name(testSparkDeltaTable.getTableName())
-            .basePath(testSparkDeltaTable.getBasePath())
+            .basePath(tableBasePath)
             .formatName(TableFormat.DELTA)
             .build();
     DeltaConversionSource conversionSource =
         conversionSourceProvider.getConversionSourceInstance(tableConfig);
     InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot();

     // validateDeltaPartitioning(internalSnapshot);

Review comment: Can you remove this comment?

-    ValidationTestHelper.validateSnapshot(
-        internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1));
+    List<String> activeDataFilePaths =
+        new ArrayList<>(testTableStates.get(testTableStates.size() - 1).activeFiles.keySet());
+    ValidationTestHelper.validateSnapshot(internalSnapshot, activeDataFilePaths);

     // Get changes in incremental format.
     InstantsForIncrementalSync instantsForIncrementalSync =
@@ -179,13 +213,126 @@ public void testInsertsUpsertsAndDeletes() {
       TableChange tableChange = conversionSource.getTableChangeForCommit(version);
       allTableChanges.add(tableChange);
     }
-    ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges);
+    List<List<String>> allActiveDataFilePaths =
+        testTableStates.stream()
+            .map(s -> s.activeFiles)
+            .map(Map::keySet)
+            .map(ArrayList::new)
+            .collect(Collectors.toList());
+    ValidationTestHelper.validateTableChanges(allActiveDataFilePaths, allTableChanges);
+
+    validateDeletionInfo(testTableStates, allTableChanges);
   }
  // collects the active files in the current snapshot as a map keyed by absolute path
  private Map<String, AddFile> collectActiveFilesAfterCommit(
      TestSparkDeltaTable testSparkDeltaTable) {
    Map<String, AddFile> allFiles =
        testSparkDeltaTable.getAllActiveFilesInfo().stream()
            .collect(
                Collectors.toMap(
                    file -> getAddFileAbsolutePath(file, testSparkDeltaTable.getBasePath()),
                    file -> file));
    return allFiles;
  }
  private void validateDeletionInfo(
      List<TableState> testTableStates, List<TableChange> allTableChanges) {
    if (allTableChanges.isEmpty() && testTableStates.size() <= 1) {
      return;
    }

    assertEquals(
        allTableChanges.size(),
        testTableStates.size() - 1,
        "Number of table changes should be equal to number of commits - 1");

    for (int i = 0; i < allTableChanges.size() - 1; i++) {
      Map<String, AddFile> activeFileAfterCommit = testTableStates.get(i + 1).activeFiles;
      Map<String, AddFile> activeFileBeforeCommit = testTableStates.get(i).activeFiles;

      Map<String, AddFile> activeFilesWithUpdatedDeleteInfo =
          activeFileAfterCommit.entrySet().stream()
              .filter(e -> e.getValue().deletionVector() != null)
              .filter(
                  entry -> {
                    if (activeFileBeforeCommit.get(entry.getKey()) == null) {
                      return true;
                    }
                    if (activeFileBeforeCommit.get(entry.getKey()).deletionVector() == null) {
                      return true;
                    }
                    DeletionVectorDescriptor deletionVectorDescriptor =
                        activeFileBeforeCommit.get(entry.getKey()).deletionVector();
                    return !deletionVectorDescriptor.equals(entry.getValue().deletionVector());
                  })
              .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

      if (activeFilesWithUpdatedDeleteInfo.isEmpty()) {
        continue;
      }

      // validate all new delete vectors are correctly detected
      validateDeletionInfoForCommit(
          testTableStates.get(i + 1), activeFilesWithUpdatedDeleteInfo, allTableChanges.get(i));
    }
  }
  private void validateDeletionInfoForCommit(
      TableState tableState,

Review comment: This is unused in the method, is that intentional?

      Map<String, AddFile> activeFilesAfterCommit,
      TableChange changeDetectedForCommit) {
    Map<String, InternalDeletionVector> detectedDeleteInfos =
        changeDetectedForCommit.getFilesDiff().getFilesAdded().stream()
            .filter(file -> file instanceof InternalDeletionVector)
            .map(file -> (InternalDeletionVector) file)
            .collect(Collectors.toMap(InternalDeletionVector::dataFilePath, file -> file));

    Map<String, AddFile> filesWithDeleteVectors =
        activeFilesAfterCommit.entrySet().stream()
            .filter(file -> file.getValue().deletionVector() != null)
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

    assertEquals(filesWithDeleteVectors.size(), detectedDeleteInfos.size());

    for (Map.Entry<String, AddFile> fileWithDeleteVector : filesWithDeleteVectors.entrySet()) {
      InternalDeletionVector deleteInfo = detectedDeleteInfos.get(fileWithDeleteVector.getKey());
      assertNotNull(deleteInfo);
      DeletionVectorDescriptor deletionVectorDescriptor =
          fileWithDeleteVector.getValue().deletionVector();
      assertEquals(deletionVectorDescriptor.cardinality(), deleteInfo.getRecordCount());
      assertEquals(deletionVectorDescriptor.sizeInBytes(), deleteInfo.getFileSizeBytes());
      assertEquals(deletionVectorDescriptor.offset().get(), deleteInfo.offset());

      String deletionFilePath =
          deletionVectorDescriptor
              .absolutePath(new org.apache.hadoop.fs.Path(testSparkDeltaTable.getBasePath()))
              .toString();
      assertEquals(deletionFilePath, deleteInfo.getPhysicalPath());

      Iterator<Long> iterator = deleteInfo.ordinalsIterator();
      List<Long> deletes = new ArrayList<>();
      iterator.forEachRemaining(deletes::add);
      assertEquals(deletes.size(), deleteInfo.getRecordCount());

Review comment: Should we also validate the ordinals are correct here?

    }
  }
  private static String getAddFileAbsolutePath(AddFile file, String tableBasePath) {
    String filePath = file.path();
    if (filePath.startsWith(tableBasePath)) {
      return filePath;
    }
    return Paths.get(tableBasePath, file.path()).toString();
  }
  private void validateDeletedRecordCount(
-      DeltaLog deltaLog, int version, int deleteVectorFileCount, int deletionRecordCount) {
+      DeltaLog deltaLog, int deleteVectorFileCount, int deletionRecordCount) {
    List<AddFile> allFiles =
-        deltaLog.getSnapshotAt(version, Option.empty()).allFiles().collectAsList();
+        deltaLog
+            .getSnapshotAt(deltaLog.snapshot().version(), Option.empty())
+            .allFiles()
+            .collectAsList();
    List<AddFile> filesWithDeletionVectors =
        allFiles.stream().filter(f -> f.deletionVector() != null).collect(Collectors.toList());
@@ -18,50 +18,176 @@
 package org.apache.xtable.delta;

-import java.net.URISyntaxException;
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.when;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.UUID;

 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
-import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 import org.mockito.Mockito;

 import org.apache.spark.sql.delta.DeltaLog;
 import org.apache.spark.sql.delta.Snapshot;
 import org.apache.spark.sql.delta.actions.AddFile;
 import org.apache.spark.sql.delta.actions.DeletionVectorDescriptor;
+import org.apache.spark.sql.delta.deletionvectors.RoaringBitmapArray;
+import org.apache.spark.sql.delta.deletionvectors.RoaringBitmapArrayFormat;

 import scala.Option;

+import org.apache.xtable.model.storage.InternalDeletionVector;

 class TestDeltaActionsConverter {

+  private final String basePath = "https://container.blob.core.windows.net/tablepath/";
+  private final int size = 372;
+  private final long time = 376;
+  private final boolean dataChange = true;
+  private final String stats = "";
+  private final int cardinality = 42;
+  private final int offset = 634;
   @Test
-  void extractDeletionVector() throws URISyntaxException {
+  void extractMissingDeletionVector() {
     DeltaActionsConverter actionsConverter = DeltaActionsConverter.getInstance();

-    int size = 123;
-    long time = 234L;
-    boolean dataChange = true;
-    String stats = "";
-    String filePath = "https://container.blob.core.windows.net/tablepath/file_path";
+    String filePath = basePath + "file_path";
     Snapshot snapshot = Mockito.mock(Snapshot.class);
     DeltaLog deltaLog = Mockito.mock(DeltaLog.class);

     DeletionVectorDescriptor deletionVector = null;
     AddFile addFileAction =
         new AddFile(filePath, null, size, time, dataChange, stats, null, deletionVector);
-    Assertions.assertNull(actionsConverter.extractDeletionVectorFile(snapshot, addFileAction));
+    InternalDeletionVector internaldeletionVector =
+        actionsConverter.extractDeletionVector(snapshot, addFileAction);
+    assertNull(internaldeletionVector);
   }
-    deletionVector =
-        DeletionVectorDescriptor.onDiskWithAbsolutePath(
-            filePath, size, 42, Option.empty(), Option.empty());
-
-    addFileAction =
-        new AddFile(filePath, null, size, time, dataChange, stats, null, deletionVector);

   @Test
   void extractDeletionVectorInFileAbsolutePath() {
     DeltaActionsConverter actionsConverter = spy(DeltaActionsConverter.getInstance());

     String dataFilePath = "data_file";
     String deleteFilePath = "https://container.blob.core.windows.net/tablepath/delete_file";
     Snapshot snapshot = Mockito.mock(Snapshot.class);

     DeletionVectorDescriptor deletionVector =
         DeletionVectorDescriptor.onDiskWithAbsolutePath(
             deleteFilePath, size, cardinality, Option.apply(offset), Option.empty());

     AddFile addFileAction =
         new AddFile(dataFilePath, null, size, time, dataChange, stats, null, deletionVector);

     Configuration conf = new Configuration();
     DeltaLog deltaLog = Mockito.mock(DeltaLog.class);
     when(snapshot.deltaLog()).thenReturn(deltaLog);
     when(deltaLog.dataPath()).thenReturn(new Path(basePath));
     when(deltaLog.newDeltaHadoopConf()).thenReturn(conf);

     long[] ordinals = {45, 78, 98};
     Mockito.doReturn(ordinals)
         .when(actionsConverter)
         .parseOrdinalFile(conf, new Path(deleteFilePath), size, offset);

Review comment (on lines +89 to +98): Can you pull the common testing setup into a helper method? Similarly, the assertions below can be added to a common method so there are fewer places to update if the assertions need to change due to a new field or something like that.
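For reference, one possible shape for the helper the reviewer is asking about, extracted from the mock setup duplicated across these tests (the method name is illustrative, not part of the PR):

```java
// Hypothetical shared setup: a Snapshot whose DeltaLog resolves paths against
// basePath and hands back the given Hadoop configuration.
private Snapshot mockSnapshotWithDeltaLog(Configuration conf) {
  Snapshot snapshot = Mockito.mock(Snapshot.class);
  DeltaLog deltaLog = Mockito.mock(DeltaLog.class);
  when(snapshot.deltaLog()).thenReturn(deltaLog);
  when(deltaLog.dataPath()).thenReturn(new Path(basePath));
  when(deltaLog.newDeltaHadoopConf()).thenReturn(conf);
  return snapshot;
}
```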
     InternalDeletionVector internaldeletionVector =
         actionsConverter.extractDeletionVector(snapshot, addFileAction);
     assertNotNull(internaldeletionVector);
     assertEquals(basePath + dataFilePath, internaldeletionVector.dataFilePath());
     assertEquals(deleteFilePath, internaldeletionVector.getPhysicalPath());
     assertEquals(offset, internaldeletionVector.offset());
     assertEquals(cardinality, internaldeletionVector.getRecordCount());
     assertEquals(size, internaldeletionVector.getFileSizeBytes());
     assertNull(internaldeletionVector.binaryRepresentation());

     Iterator<Long> iterator = internaldeletionVector.ordinalsIterator();
     Arrays.stream(ordinals).forEach(o -> assertEquals(o, iterator.next()));
     assertFalse(iterator.hasNext());
   }
   @Test
   void extractDeletionVectorInFileRelativePath() {
     DeltaActionsConverter actionsConverter = spy(DeltaActionsConverter.getInstance());

     String dataFilePath = "data_file";
     UUID deleteFileId = UUID.randomUUID();
     String deleteFilePath = basePath + "deletion_vector_" + deleteFileId + ".bin";
     Snapshot snapshot = Mockito.mock(Snapshot.class);

     DeletionVectorDescriptor deletionVector =
         DeletionVectorDescriptor.onDiskWithRelativePath(
             deleteFileId, "", size, cardinality, Option.apply(offset), Option.empty());

     AddFile addFileAction =
         new AddFile(dataFilePath, null, size, time, dataChange, stats, null, deletionVector);

     Configuration conf = new Configuration();
     DeltaLog deltaLog = Mockito.mock(DeltaLog.class);
     when(snapshot.deltaLog()).thenReturn(deltaLog);
     when(deltaLog.dataPath()).thenReturn(new Path(basePath));
     when(deltaLog.newDeltaHadoopConf()).thenReturn(conf);

     long[] ordinals = {45, 78, 98};
     Mockito.doReturn(ordinals)
         .when(actionsConverter)
         .parseOrdinalFile(conf, new Path(deleteFilePath), size, offset);

     InternalDeletionVector internaldeletionVector =
         actionsConverter.extractDeletionVector(snapshot, addFileAction);
     assertNotNull(internaldeletionVector);
     assertEquals(basePath + dataFilePath, internaldeletionVector.dataFilePath());
     assertEquals(deleteFilePath, internaldeletionVector.getPhysicalPath());
     assertEquals(offset, internaldeletionVector.offset());
     assertEquals(cardinality, internaldeletionVector.getRecordCount());
     assertEquals(size, internaldeletionVector.getFileSizeBytes());
     assertNull(internaldeletionVector.binaryRepresentation());

     Iterator<Long> iterator = internaldeletionVector.ordinalsIterator();
     Arrays.stream(ordinals).forEach(o -> assertEquals(o, iterator.next()));
     assertFalse(iterator.hasNext());
   }
   @Test
   void extractInlineDeletionVector() {
     DeltaActionsConverter actionsConverter = spy(DeltaActionsConverter.getInstance());

     String dataFilePath = "data_file";
     Snapshot snapshot = Mockito.mock(Snapshot.class);

     long[] ordinals = {45, 78, 98};
     RoaringBitmapArray rbm = new RoaringBitmapArray();
     Arrays.stream(ordinals).forEach(rbm::add);
     byte[] bytes = rbm.serializeAsByteArray(RoaringBitmapArrayFormat.Portable());

     DeletionVectorDescriptor deletionVector =
         DeletionVectorDescriptor.inlineInLog(bytes, cardinality);

     AddFile addFileAction =
         new AddFile(dataFilePath, null, size, time, dataChange, stats, null, deletionVector);

     DeltaLog deltaLog = Mockito.mock(DeltaLog.class);
     when(snapshot.deltaLog()).thenReturn(deltaLog);
     when(deltaLog.dataPath()).thenReturn(new Path(basePath));

     InternalDeletionVector internaldeletionVector =
         actionsConverter.extractDeletionVector(snapshot, addFileAction);
     assertNotNull(internaldeletionVector);
     assertEquals(basePath + dataFilePath, internaldeletionVector.dataFilePath());
     assertArrayEquals(bytes, internaldeletionVector.binaryRepresentation());
     assertEquals(cardinality, internaldeletionVector.getRecordCount());
     assertEquals(bytes.length, internaldeletionVector.getFileSizeBytes());
     assertEquals("", internaldeletionVector.getPhysicalPath());
     assertEquals(0, internaldeletionVector.offset());

-    Mockito.when(snapshot.deltaLog()).thenReturn(deltaLog);
-    Mockito.when(deltaLog.dataPath())
-        .thenReturn(new Path("https://container.blob.core.windows.net/tablepath"));
-    Assertions.assertEquals(
-        filePath, actionsConverter.extractDeletionVectorFile(snapshot, addFileAction));

     Iterator<Long> iterator = internaldeletionVector.ordinalsIterator();
     Arrays.stream(ordinals).forEach(o -> assertEquals(o, iterator.next()));
     assertFalse(iterator.hasNext());
   }
 }
Review comment (on the binaryRepresentation field): Currently, when this field is set to a non-null value, the ordinalsIterator is also set. I think it may be cleaner to remove this field and rely directly on the ordinalsIterator. Is there something in the future, though, where this may be used directly? My main worry is that future developers implementing support for deletion vectors may eagerly parse the data into this field.
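A consumer that follows this advice never touches `binaryRepresentation()` and sticks to the lazy iterator. A minimal sketch (the `markDeleted` sink is hypothetical):

```java
// Sketch: consume deleted-row ordinals lazily via the supplier-backed iterator.
Iterator<Long> ordinals = deletionVector.ordinalsIterator();
while (ordinals.hasNext()) {
  long deletedRowOrdinal = ordinals.next();
  markDeleted(deletionVector.dataFilePath(), deletedRowOrdinal); // hypothetical sink
}
```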