
Commit ad5dc14

Authored by Kontinuation, steveloughran, and Kristin Cowalcijk
Build: Remove Hadoop 2 (#12348)
* Move Iceberg to Hadoop 3

  Now that the minimum Java version is 11, it is impossible for Iceberg to work on a Hadoop release older than 3.3.0. Removing the hadoop2 version and libraries forces all building and testing onto a compatible version, and permits follow-up work using modern Hadoop APIs.

  Co-authored-by: Kristin Cowalcijk <[email protected]>
  Co-authored-by: Steve Loughran <[email protected]>

* Fix test failures for Spark

---------

Co-authored-by: Steve Loughran <[email protected]>
Co-authored-by: Kristin Cowalcijk <[email protected]>
1 parent 2f88ff6 · commit ad5dc14

12 files changed (+59 −45 lines)


aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java

Lines changed: 2 additions & 3 deletions

@@ -37,10 +37,9 @@
 import java.security.NoSuchAlgorithmException;
 import java.util.Comparator;
 import java.util.List;
-import java.util.Locale;
 import java.util.Map;
 import java.util.stream.Stream;
-import org.apache.directory.api.util.Hex;
+import org.apache.commons.codec.binary.Hex;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.io.ByteStreams;
@@ -87,7 +86,7 @@ static String md5sum(InputStream is) throws IOException {
     while ((numBytes = is.read(bytes)) != -1) {
       md.update(bytes, 0, numBytes);
     }
-    return new String(Hex.encodeHex(md.digest())).toUpperCase(Locale.ROOT);
+    return Hex.encodeHexString(md.digest(), false);
   }

   private static void inputStreamToFile(InputStream inputStream, File targetFile)
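
Note: the Commons Codec replacement above keeps md5sum's output identical, since passing false as the second argument of Hex.encodeHexString requests upper-case hex, matching the old encodeHex(...) plus toUpperCase(Locale.ROOT) combination. A minimal standalone sketch, not part of the commit (the class name and the "hello" input are illustrative; it assumes commons-codec 1.11+ on the classpath):

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Locale;
import org.apache.commons.codec.binary.Hex;

public class HexDigestSketch {
  public static void main(String[] args) throws NoSuchAlgorithmException {
    MessageDigest md = MessageDigest.getInstance("MD5");
    byte[] digest = md.digest("hello".getBytes(StandardCharsets.UTF_8));

    // Style used by the commit: toLowerCase=false yields upper-case hex.
    String updated = Hex.encodeHexString(digest, false);

    // Equivalent of the old style: encode, then force upper case.
    String legacy = new String(Hex.encodeHex(digest)).toUpperCase(Locale.ROOT);

    System.out.println(updated.equals(legacy)); // prints true
  }
}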

build.gradle

Lines changed: 15 additions & 15 deletions

@@ -348,7 +348,7 @@ project(':iceberg-core') {
     implementation libs.jackson.databind
     implementation libs.caffeine
     implementation libs.roaringbitmap
-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     }
@@ -373,7 +373,7 @@ project(':iceberg-data') {
     implementation project(':iceberg-core')
     compileOnly project(':iceberg-parquet')
     compileOnly project(':iceberg-orc')
-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'commons-beanutils'
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
@@ -396,7 +396,7 @@ project(':iceberg-data') {

     compileOnly libs.avro.avro

-    testImplementation(libs.hadoop2.client) {
+    testImplementation(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     }
@@ -427,7 +427,7 @@ project(':iceberg-aliyun') {
     compileOnly libs.jaxb.api
     compileOnly libs.activation
     compileOnly libs.jaxb.runtime
-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
       exclude group: 'javax.servlet', module: 'servlet-api'
@@ -470,7 +470,7 @@ project(':iceberg-aws') {
     compileOnly("software.amazon.awssdk:dynamodb")
     compileOnly("software.amazon.awssdk:lakeformation")

-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
       exclude group: 'javax.servlet', module: 'servlet-api'
@@ -572,7 +572,7 @@ project(':iceberg-delta-lake') {

     compileOnly "io.delta:delta-standalone_${scalaVersion}:${libs.versions.delta.standalone.get()}"

-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
       exclude group: 'javax.servlet', module: 'servlet-api'
@@ -584,7 +584,7 @@ project(':iceberg-delta-lake') {
     if (sparkVersions.contains("3.5")) {
       integrationImplementation "io.delta:delta-spark_${scalaVersion}:${libs.versions.delta.spark.get()}"
       integrationImplementation project(path: ":iceberg-spark:iceberg-spark-3.5_${scalaVersion}")
-      integrationImplementation(libs.hadoop2.minicluster) {
+      integrationImplementation(libs.hadoop3.minicluster) {
         exclude group: 'org.apache.avro', module: 'avro'
         // to make sure netty libs only come from project(':iceberg-arrow')
         exclude group: 'io.netty', module: 'netty-buffer'
@@ -645,7 +645,7 @@ project(':iceberg-gcp') {
     testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
     testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts')

-    testImplementation(libs.hadoop2.common) {
+    testImplementation(libs.hadoop3.common) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
       exclude group: 'javax.servlet', module: 'servlet-api'
@@ -722,7 +722,7 @@ project(':iceberg-hive-metastore') {
       exclude group: 'com.zaxxer', module: 'HikariCP'
     }

-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     }
@@ -754,12 +754,12 @@ project(':iceberg-orc') {
       exclude group: 'org.apache.hive', module: 'hive-storage-api'
     }

-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'commons-beanutils'
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     }
-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

@@ -788,7 +788,7 @@ project(':iceberg-parquet') {
     }

     compileOnly libs.avro.avro
-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

@@ -832,8 +832,8 @@ project(':iceberg-arrow') {
     // We import :netty-common through :arrow-memory-netty
     // so that the same version as used by the :arrow-memory-netty module is picked.
     testImplementation libs.arrow.memory.netty
-    testImplementation libs.hadoop2.common
-    testImplementation libs.hadoop2.mapreduce.client.core
+    testImplementation libs.hadoop3.common
+    testImplementation libs.hadoop3.mapreduce.client.core
   }
 }

@@ -854,7 +854,7 @@ project(':iceberg-nessie') {
     implementation libs.jackson.core
     implementation libs.jackson.databind

-    compileOnly libs.hadoop2.common
+    compileOnly libs.hadoop3.common
     // Only there to prevent "warning: unknown enum constant SchemaType.OBJECT" compile messages
     compileOnly libs.microprofile.openapi.api

flink/v1.18/build.gradle

Lines changed: 6 additions & 6 deletions

@@ -42,9 +42,9 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") {
     compileOnly libs.flink118.connector.base
     compileOnly libs.flink118.connector.files

-    compileOnly libs.hadoop2.hdfs
-    compileOnly libs.hadoop2.common
-    compileOnly(libs.hadoop2.minicluster) {
+    compileOnly libs.hadoop3.hdfs
+    compileOnly libs.hadoop3.common
+    compileOnly(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

@@ -186,9 +186,9 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") {
     integrationImplementation libs.flink118.table.api.java.bridge
     integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink118.get()}"

-    integrationImplementation libs.hadoop2.common
-    integrationImplementation libs.hadoop2.hdfs
-    integrationImplementation(libs.hadoop2.minicluster) {
+    integrationImplementation libs.hadoop3.common
+    integrationImplementation libs.hadoop3.hdfs
+    integrationImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

flink/v1.19/build.gradle

Lines changed: 6 additions & 6 deletions

@@ -42,9 +42,9 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") {
     compileOnly libs.flink119.connector.base
     compileOnly libs.flink119.connector.files

-    compileOnly libs.hadoop2.hdfs
-    compileOnly libs.hadoop2.common
-    compileOnly(libs.hadoop2.minicluster) {
+    compileOnly libs.hadoop3.hdfs
+    compileOnly libs.hadoop3.common
+    compileOnly(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

@@ -187,9 +187,9 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") {
     integrationImplementation libs.flink119.table.api.java.bridge
     integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink119.get()}"

-    integrationImplementation libs.hadoop2.common
-    integrationImplementation libs.hadoop2.hdfs
-    integrationImplementation(libs.hadoop2.minicluster) {
+    integrationImplementation libs.hadoop3.common
+    integrationImplementation libs.hadoop3.hdfs
+    integrationImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

flink/v1.20/build.gradle

Lines changed: 6 additions & 6 deletions

@@ -42,9 +42,9 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") {
     compileOnly libs.flink120.connector.base
     compileOnly libs.flink120.connector.files

-    compileOnly libs.hadoop2.hdfs
-    compileOnly libs.hadoop2.common
-    compileOnly(libs.hadoop2.minicluster) {
+    compileOnly libs.hadoop3.hdfs
+    compileOnly libs.hadoop3.common
+    compileOnly(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

@@ -187,9 +187,9 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") {
     integrationImplementation libs.flink120.table.api.java.bridge
     integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink120.get()}"

-    integrationImplementation libs.hadoop2.common
-    integrationImplementation libs.hadoop2.hdfs
-    integrationImplementation(libs.hadoop2.minicluster) {
+    integrationImplementation libs.hadoop3.common
+    integrationImplementation libs.hadoop3.hdfs
+    integrationImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

gradle/libs.versions.toml

Lines changed: 3 additions & 6 deletions

@@ -47,7 +47,6 @@ flink119 = { strictly = "1.19.1"}
 flink120 = { strictly = "1.20.0"}
 google-libraries-bom = "26.55.0"
 guava = "33.4.0-jre"
-hadoop2 = "2.7.3"
 hadoop3 = "3.4.1"
 httpcomponents-httpclient5 = "5.4.2"
 hive2 = { strictly = "2.3.10"} # see rich version usage explanation above
@@ -126,13 +125,11 @@ flink120-streaming-java = { module = "org.apache.flink:flink-streaming-java", version.ref = "flink120" }
 flink120-table-api-java-bridge = { module = "org.apache.flink:flink-table-api-java-bridge", version.ref = "flink120" }
 google-libraries-bom = { module = "com.google.cloud:libraries-bom", version.ref = "google-libraries-bom" }
 guava-guava = { module = "com.google.guava:guava", version.ref = "guava" }
-hadoop2-client = { module = "org.apache.hadoop:hadoop-client", version.ref = "hadoop2" }
-hadoop2-common = { module = "org.apache.hadoop:hadoop-common", version.ref = "hadoop2" }
-hadoop2-hdfs = { module = "org.apache.hadoop:hadoop-hdfs", version.ref = "hadoop2" }
-hadoop2-mapreduce-client-core = { module = "org.apache.hadoop:hadoop-mapreduce-client-core", version.ref = "hadoop2" }
-hadoop2-minicluster = { module = "org.apache.hadoop:hadoop-minicluster", version.ref = "hadoop2" }
 hadoop3-client = { module = "org.apache.hadoop:hadoop-client", version.ref = "hadoop3" }
 hadoop3-common = { module = "org.apache.hadoop:hadoop-common", version.ref = "hadoop3" }
+hadoop3-hdfs = { module = "org.apache.hadoop:hadoop-hdfs", version.ref = "hadoop3" }
+hadoop3-mapreduce-client-core = { module = "org.apache.hadoop:hadoop-mapreduce-client-core", version.ref = "hadoop3" }
+hadoop3-minicluster = { module = "org.apache.hadoop:hadoop-minicluster", version.ref = "hadoop3" }
 hive2-exec = { module = "org.apache.hive:hive-exec", version.ref = "hive2" }
 hive2-metastore = { module = "org.apache.hive:hive-metastore", version.ref = "hive2" }
 hive2-service = { module = "org.apache.hive:hive-service", version.ref = "hive2" }

mr/build.gradle

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ project(':iceberg-mr') {
     implementation project(':iceberg-orc')
     implementation project(':iceberg-parquet')

-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
     }
spark/v3.4/build.gradle

Lines changed: 1 addition & 1 deletion

@@ -96,7 +96,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") {

     implementation libs.caffeine

-    testImplementation(libs.hadoop2.minicluster) {
+    testImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
       // to make sure netty libs only come from project(':iceberg-arrow')
       exclude group: 'io.netty', module: 'netty-buffer'

spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCompressionSettings.java

Lines changed: 8 additions & 0 deletions

@@ -70,6 +70,7 @@
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
 import org.junit.AfterClass;
+import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
@@ -106,6 +107,13 @@ public static void startSpark() {
     TestCompressionSettings.spark = SparkSession.builder().master("local[2]").getOrCreate();
   }

+  @Before
+  public void resetSpecificConfigurations() {
+    spark.conf().unset(COMPRESSION_CODEC);
+    spark.conf().unset(COMPRESSION_LEVEL);
+    spark.conf().unset(COMPRESSION_STRATEGY);
+  }
+
   @Parameterized.AfterParam
   public static void clearSourceCache() {
     spark.sql(String.format("DROP TABLE IF EXISTS %s", TABLE_NAME));
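
Note: the @Before hook added above exists because the SparkSession is shared across the parameterized runs of this test, so a compression codec, level, or strategy left behind by one run would otherwise leak into the next. A standalone sketch of the same reset pattern, not the actual test (the property spark.sql.parquet.compression.codec merely stands in for the COMPRESSION_* keys the test unsets):

import org.apache.spark.sql.SparkSession;

public class ConfResetSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local[2]").getOrCreate();
    try {
      // A previous "run" leaves an override behind on the shared session.
      spark.conf().set("spark.sql.parquet.compression.codec", "zstd");

      // Equivalent of the test's @Before hook: drop the override so the next
      // run starts from the session defaults rather than whatever ran last.
      spark.conf().unset("spark.sql.parquet.compression.codec");

      // With the override gone, reads fall back to the supplied default.
      System.out.println(spark.conf().get("spark.sql.parquet.compression.codec", "snappy"));
    } finally {
      spark.stop();
    }
  }
}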

spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java

Lines changed: 5 additions & 0 deletions

@@ -22,6 +22,8 @@
 import static org.assertj.core.api.Assertions.assertThatThrownBy;

 import java.io.File;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.List;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.iceberg.PartitionSpec;
@@ -118,6 +120,7 @@ public void testStreamingWriteAppendMode() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete());
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
@@ -178,6 +181,7 @@ public void testStreamingWriteCompleteMode() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete());
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
@@ -238,6 +242,7 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete());
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
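
Note: the extra Files.deleteIfExists calls are presumably needed because, with the Hadoop 3 libraries now on the test classpath, the streaming checkpoint goes through Hadoop's checksummed local file system, which keeps a .<name>.crc sidecar next to each commit file; removing only commits/1 would leave a stale checksum behind when the restarted query rewrites that batch. A small sketch of the pattern, with a hypothetical helper and checkpoint path rather than code from the commit:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public final class CommitCleanupSketch {
  private CommitCleanupSketch() {}

  // Deletes commits/<batchId> and its .<batchId>.crc checksum sidecar, if present.
  static void deleteCommitWithChecksum(String checkpoint, int batchId) throws IOException {
    Path commit = Paths.get(checkpoint, "commits", Integer.toString(batchId));
    Path crc = Paths.get(checkpoint, "commits", "." + batchId + ".crc");
    Files.deleteIfExists(commit);
    Files.deleteIfExists(crc);
  }

  public static void main(String[] args) throws IOException {
    // Hypothetical checkpoint location; the tests derive theirs from a temp directory.
    deleteCommitWithChecksum("/tmp/checkpoint", 1);
  }
}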

spark/v3.5/build.gradle

Lines changed: 1 addition & 1 deletion

@@ -96,7 +96,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") {

     implementation libs.caffeine

-    testImplementation(libs.hadoop2.minicluster) {
+    testImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
       // to make sure netty libs only come from project(':iceberg-arrow')
       exclude group: 'io.netty', module: 'netty-buffer'

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java

Lines changed: 5 additions & 0 deletions

@@ -23,7 +23,9 @@
 import static org.assertj.core.api.Assertions.assertThatThrownBy;

 import java.io.File;
+import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.List;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.iceberg.PartitionSpec;
@@ -117,6 +119,7 @@ public void testStreamingWriteAppendMode() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     assertThat(lastCommitFile.delete()).as("The commit file must be deleted").isTrue();
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
@@ -178,6 +181,7 @@ public void testStreamingWriteCompleteMode() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     assertThat(lastCommitFile.delete()).as("The commit file must be deleted").isTrue();
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
@@ -239,6 +243,7 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     assertThat(lastCommitFile.delete()).as("The commit file must be deleted").isTrue();
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
