
Commit ad5dc14

Authored by Kontinuation, steveloughran, and Kristin Cowalcijk
Build: Remove Hadoop 2 (#12348)
* Move Iceberg to Hadoop 3

  Now that the minimum Java version is 11, it is impossible for Iceberg to work on a Hadoop release older than 3.3.0. Removing the hadoop2 version and libraries forces all building and testing onto a compatible version, and permits follow-up work using modern Hadoop APIs.

  Co-authored-by: Kristin Cowalcijk <[email protected]>
  Co-authored-by: Steve Loughran <[email protected]>

* Fix test failures for Spark

---------

Co-authored-by: Steve Loughran <[email protected]>
Co-authored-by: Kristin Cowalcijk <[email protected]>
1 parent 2f88ff6 · commit ad5dc14

12 files changed (+59 −45 lines)


aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java

Lines changed: 2 additions & 3 deletions

@@ -37,10 +37,9 @@
 import java.security.NoSuchAlgorithmException;
 import java.util.Comparator;
 import java.util.List;
-import java.util.Locale;
 import java.util.Map;
 import java.util.stream.Stream;
-import org.apache.directory.api.util.Hex;
+import org.apache.commons.codec.binary.Hex;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.io.ByteStreams;
@@ -87,7 +86,7 @@ static String md5sum(InputStream is) throws IOException {
     while ((numBytes = is.read(bytes)) != -1) {
       md.update(bytes, 0, numBytes);
     }
-    return new String(Hex.encodeHex(md.digest())).toUpperCase(Locale.ROOT);
+    return Hex.encodeHexString(md.digest(), false);
   }

   private static void inputStreamToFile(InputStream inputStream, File targetFile)
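
Note: the Commons Codec replacement above keeps md5sum's output identical, since passing false as the second argument of Hex.encodeHexString requests upper-case hex, matching the old encodeHex(...) plus toUpperCase(Locale.ROOT) combination. A minimal standalone sketch, not part of the commit (the class name and the "hello" input are illustrative; it assumes commons-codec 1.11+ on the classpath):

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Locale;
import org.apache.commons.codec.binary.Hex;

public class HexDigestSketch {
  public static void main(String[] args) throws NoSuchAlgorithmException {
    MessageDigest md = MessageDigest.getInstance("MD5");
    byte[] digest = md.digest("hello".getBytes(StandardCharsets.UTF_8));

    // Style used by the commit: toLowerCase=false yields upper-case hex.
    String updated = Hex.encodeHexString(digest, false);

    // Equivalent of the old style: encode, then force upper case.
    String legacy = new String(Hex.encodeHex(digest)).toUpperCase(Locale.ROOT);

    System.out.println(updated.equals(legacy)); // prints true
  }
}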

build.gradle

Lines changed: 15 additions & 15 deletions

@@ -348,7 +348,7 @@ project(':iceberg-core') {
     implementation libs.jackson.databind
     implementation libs.caffeine
     implementation libs.roaringbitmap
-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     }
@@ -373,7 +373,7 @@ project(':iceberg-data') {
     implementation project(':iceberg-core')
     compileOnly project(':iceberg-parquet')
     compileOnly project(':iceberg-orc')
-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'commons-beanutils'
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
@@ -396,7 +396,7 @@ project(':iceberg-data') {

     compileOnly libs.avro.avro

-    testImplementation(libs.hadoop2.client) {
+    testImplementation(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     }
@@ -427,7 +427,7 @@ project(':iceberg-aliyun') {
     compileOnly libs.jaxb.api
     compileOnly libs.activation
     compileOnly libs.jaxb.runtime
-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
       exclude group: 'javax.servlet', module: 'servlet-api'
@@ -470,7 +470,7 @@ project(':iceberg-aws') {
     compileOnly("software.amazon.awssdk:dynamodb")
     compileOnly("software.amazon.awssdk:lakeformation")

-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
       exclude group: 'javax.servlet', module: 'servlet-api'
@@ -572,7 +572,7 @@ project(':iceberg-delta-lake') {

     compileOnly "io.delta:delta-standalone_${scalaVersion}:${libs.versions.delta.standalone.get()}"

-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
       exclude group: 'javax.servlet', module: 'servlet-api'
@@ -584,7 +584,7 @@ project(':iceberg-delta-lake') {
     if (sparkVersions.contains("3.5")) {
       integrationImplementation "io.delta:delta-spark_${scalaVersion}:${libs.versions.delta.spark.get()}"
       integrationImplementation project(path: ":iceberg-spark:iceberg-spark-3.5_${scalaVersion}")
-      integrationImplementation(libs.hadoop2.minicluster) {
+      integrationImplementation(libs.hadoop3.minicluster) {
         exclude group: 'org.apache.avro', module: 'avro'
         // to make sure netty libs only come from project(':iceberg-arrow')
         exclude group: 'io.netty', module: 'netty-buffer'
@@ -645,7 +645,7 @@ project(':iceberg-gcp') {
     testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
     testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts')

-    testImplementation(libs.hadoop2.common) {
+    testImplementation(libs.hadoop3.common) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
       exclude group: 'javax.servlet', module: 'servlet-api'
@@ -722,7 +722,7 @@ project(':iceberg-hive-metastore') {
       exclude group: 'com.zaxxer', module: 'HikariCP'
     }

-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     }
@@ -754,12 +754,12 @@ project(':iceberg-orc') {
       exclude group: 'org.apache.hive', module: 'hive-storage-api'
     }

-    compileOnly(libs.hadoop2.common) {
+    compileOnly(libs.hadoop3.common) {
       exclude group: 'commons-beanutils'
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     }
-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

@@ -788,7 +788,7 @@ project(':iceberg-parquet') {
     }

     compileOnly libs.avro.avro
-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

@@ -832,8 +832,8 @@ project(':iceberg-arrow') {
     // We import :netty-common through :arrow-memory-netty
     // so that the same version as used by the :arrow-memory-netty module is picked.
     testImplementation libs.arrow.memory.netty
-    testImplementation libs.hadoop2.common
-    testImplementation libs.hadoop2.mapreduce.client.core
+    testImplementation libs.hadoop3.common
+    testImplementation libs.hadoop3.mapreduce.client.core
   }
 }

@@ -854,7 +854,7 @@ project(':iceberg-nessie') {
     implementation libs.jackson.core
     implementation libs.jackson.databind

-    compileOnly libs.hadoop2.common
+    compileOnly libs.hadoop3.common
     // Only there to prevent "warning: unknown enum constant SchemaType.OBJECT" compile messages
     compileOnly libs.microprofile.openapi.api

flink/v1.18/build.gradle

Lines changed: 6 additions & 6 deletions

@@ -42,9 +42,9 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") {
     compileOnly libs.flink118.connector.base
     compileOnly libs.flink118.connector.files

-    compileOnly libs.hadoop2.hdfs
-    compileOnly libs.hadoop2.common
-    compileOnly(libs.hadoop2.minicluster) {
+    compileOnly libs.hadoop3.hdfs
+    compileOnly libs.hadoop3.common
+    compileOnly(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

@@ -186,9 +186,9 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") {
     integrationImplementation libs.flink118.table.api.java.bridge
     integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink118.get()}"

-    integrationImplementation libs.hadoop2.common
-    integrationImplementation libs.hadoop2.hdfs
-    integrationImplementation(libs.hadoop2.minicluster) {
+    integrationImplementation libs.hadoop3.common
+    integrationImplementation libs.hadoop3.hdfs
+    integrationImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

flink/v1.19/build.gradle

Lines changed: 6 additions & 6 deletions

@@ -42,9 +42,9 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") {
     compileOnly libs.flink119.connector.base
     compileOnly libs.flink119.connector.files

-    compileOnly libs.hadoop2.hdfs
-    compileOnly libs.hadoop2.common
-    compileOnly(libs.hadoop2.minicluster) {
+    compileOnly libs.hadoop3.hdfs
+    compileOnly libs.hadoop3.common
+    compileOnly(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

@@ -187,9 +187,9 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") {
     integrationImplementation libs.flink119.table.api.java.bridge
     integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink119.get()}"

-    integrationImplementation libs.hadoop2.common
-    integrationImplementation libs.hadoop2.hdfs
-    integrationImplementation(libs.hadoop2.minicluster) {
+    integrationImplementation libs.hadoop3.common
+    integrationImplementation libs.hadoop3.hdfs
+    integrationImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

flink/v1.20/build.gradle

Lines changed: 6 additions & 6 deletions

@@ -42,9 +42,9 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") {
     compileOnly libs.flink120.connector.base
     compileOnly libs.flink120.connector.files

-    compileOnly libs.hadoop2.hdfs
-    compileOnly libs.hadoop2.common
-    compileOnly(libs.hadoop2.minicluster) {
+    compileOnly libs.hadoop3.hdfs
+    compileOnly libs.hadoop3.common
+    compileOnly(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

@@ -187,9 +187,9 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") {
     integrationImplementation libs.flink120.table.api.java.bridge
     integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink120.get()}"

-    integrationImplementation libs.hadoop2.common
-    integrationImplementation libs.hadoop2.hdfs
-    integrationImplementation(libs.hadoop2.minicluster) {
+    integrationImplementation libs.hadoop3.common
+    integrationImplementation libs.hadoop3.hdfs
+    integrationImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
     }

gradle/libs.versions.toml

Lines changed: 3 additions & 6 deletions

@@ -47,7 +47,6 @@ flink119 = { strictly = "1.19.1"}
 flink120 = { strictly = "1.20.0"}
 google-libraries-bom = "26.55.0"
 guava = "33.4.0-jre"
-hadoop2 = "2.7.3"
 hadoop3 = "3.4.1"
 httpcomponents-httpclient5 = "5.4.2"
 hive2 = { strictly = "2.3.10"} # see rich version usage explanation above
@@ -126,13 +125,11 @@ flink120-streaming-java = { module = "org.apache.flink:flink-streaming-java", version.ref = "flink120" }
 flink120-table-api-java-bridge = { module = "org.apache.flink:flink-table-api-java-bridge", version.ref = "flink120" }
 google-libraries-bom = { module = "com.google.cloud:libraries-bom", version.ref = "google-libraries-bom" }
 guava-guava = { module = "com.google.guava:guava", version.ref = "guava" }
-hadoop2-client = { module = "org.apache.hadoop:hadoop-client", version.ref = "hadoop2" }
-hadoop2-common = { module = "org.apache.hadoop:hadoop-common", version.ref = "hadoop2" }
-hadoop2-hdfs = { module = "org.apache.hadoop:hadoop-hdfs", version.ref = "hadoop2" }
-hadoop2-mapreduce-client-core = { module = "org.apache.hadoop:hadoop-mapreduce-client-core", version.ref = "hadoop2" }
-hadoop2-minicluster = { module = "org.apache.hadoop:hadoop-minicluster", version.ref = "hadoop2" }
 hadoop3-client = { module = "org.apache.hadoop:hadoop-client", version.ref = "hadoop3" }
 hadoop3-common = { module = "org.apache.hadoop:hadoop-common", version.ref = "hadoop3" }
+hadoop3-hdfs = { module = "org.apache.hadoop:hadoop-hdfs", version.ref = "hadoop3" }
+hadoop3-mapreduce-client-core = { module = "org.apache.hadoop:hadoop-mapreduce-client-core", version.ref = "hadoop3" }
+hadoop3-minicluster = { module = "org.apache.hadoop:hadoop-minicluster", version.ref = "hadoop3" }
 hive2-exec = { module = "org.apache.hive:hive-exec", version.ref = "hive2" }
 hive2-metastore = { module = "org.apache.hive:hive-metastore", version.ref = "hive2" }
 hive2-service = { module = "org.apache.hive:hive-service", version.ref = "hive2" }

mr/build.gradle

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ project(':iceberg-mr') {
     implementation project(':iceberg-orc')
     implementation project(':iceberg-parquet')

-    compileOnly(libs.hadoop2.client) {
+    compileOnly(libs.hadoop3.client) {
       exclude group: 'org.apache.avro', module: 'avro'
     }
spark/v3.4/build.gradle

Lines changed: 1 addition & 1 deletion

@@ -96,7 +96,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") {

     implementation libs.caffeine

-    testImplementation(libs.hadoop2.minicluster) {
+    testImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
       // to make sure netty libs only come from project(':iceberg-arrow')
       exclude group: 'io.netty', module: 'netty-buffer'

spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCompressionSettings.java

Lines changed: 8 additions & 0 deletions

@@ -70,6 +70,7 @@
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
 import org.junit.AfterClass;
+import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
@@ -106,6 +107,13 @@ public static void startSpark() {
     TestCompressionSettings.spark = SparkSession.builder().master("local[2]").getOrCreate();
   }

+  @Before
+  public void resetSpecificConfigurations() {
+    spark.conf().unset(COMPRESSION_CODEC);
+    spark.conf().unset(COMPRESSION_LEVEL);
+    spark.conf().unset(COMPRESSION_STRATEGY);
+  }
+
   @Parameterized.AfterParam
   public static void clearSourceCache() {
     spark.sql(String.format("DROP TABLE IF EXISTS %s", TABLE_NAME));
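
Note: the @Before hook added above exists because the SparkSession is shared across the parameterized runs of this test, so a compression codec, level, or strategy left behind by one run would otherwise leak into the next. A standalone sketch of the same reset pattern, not the actual test (the property spark.sql.parquet.compression.codec merely stands in for the COMPRESSION_* keys the test unsets):

import org.apache.spark.sql.SparkSession;

public class ConfResetSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local[2]").getOrCreate();
    try {
      // A previous "run" leaves an override behind on the shared session.
      spark.conf().set("spark.sql.parquet.compression.codec", "zstd");

      // Equivalent of the test's @Before hook: drop the override so the next
      // run starts from the session defaults rather than whatever ran last.
      spark.conf().unset("spark.sql.parquet.compression.codec");

      // With the override gone, reads fall back to the supplied default.
      System.out.println(spark.conf().get("spark.sql.parquet.compression.codec", "snappy"));
    } finally {
      spark.stop();
    }
  }
}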

spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java

Lines changed: 5 additions & 0 deletions

@@ -22,6 +22,8 @@
 import static org.assertj.core.api.Assertions.assertThatThrownBy;

 import java.io.File;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.List;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.iceberg.PartitionSpec;
@@ -118,6 +120,7 @@ public void testStreamingWriteAppendMode() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete());
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
@@ -178,6 +181,7 @@ public void testStreamingWriteCompleteMode() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete());
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
@@ -238,6 +242,7 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete());
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
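
Note: the extra Files.deleteIfExists calls are presumably needed because, with the Hadoop 3 libraries now on the test classpath, the streaming checkpoint goes through Hadoop's checksummed local file system, which keeps a .<name>.crc sidecar next to each commit file; removing only commits/1 would leave a stale checksum behind when the restarted query rewrites that batch. A small sketch of the pattern, with a hypothetical helper and checkpoint path rather than code from the commit:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public final class CommitCleanupSketch {
  private CommitCleanupSketch() {}

  // Deletes commits/<batchId> and its .<batchId>.crc checksum sidecar, if present.
  static void deleteCommitWithChecksum(String checkpoint, int batchId) throws IOException {
    Path commit = Paths.get(checkpoint, "commits", Integer.toString(batchId));
    Path crc = Paths.get(checkpoint, "commits", "." + batchId + ".crc");
    Files.deleteIfExists(commit);
    Files.deleteIfExists(crc);
  }

  public static void main(String[] args) throws IOException {
    // Hypothetical checkpoint location; the tests derive theirs from a temp directory.
    deleteCommitWithChecksum("/tmp/checkpoint", 1);
  }
}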

spark/v3.5/build.gradle

Lines changed: 1 addition & 1 deletion

@@ -96,7 +96,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") {

     implementation libs.caffeine

-    testImplementation(libs.hadoop2.minicluster) {
+    testImplementation(libs.hadoop3.minicluster) {
       exclude group: 'org.apache.avro', module: 'avro'
       // to make sure netty libs only come from project(':iceberg-arrow')
       exclude group: 'io.netty', module: 'netty-buffer'

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java

Lines changed: 5 additions & 0 deletions

@@ -23,7 +23,9 @@
 import static org.assertj.core.api.Assertions.assertThatThrownBy;

 import java.io.File;
+import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.List;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.iceberg.PartitionSpec;
@@ -117,6 +119,7 @@ public void testStreamingWriteAppendMode() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     assertThat(lastCommitFile.delete()).as("The commit file must be deleted").isTrue();
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
@@ -178,6 +181,7 @@ public void testStreamingWriteCompleteMode() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     assertThat(lastCommitFile.delete()).as("The commit file must be deleted").isTrue();
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
@@ -239,6 +243,7 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception {
     // remove the last commit to force Spark to reprocess batch #1
     File lastCommitFile = new File(checkpoint + "/commits/1");
     assertThat(lastCommitFile.delete()).as("The commit file must be deleted").isTrue();
+    Files.deleteIfExists(Paths.get(checkpoint + "/commits/.1.crc"));

     // restart the query from the checkpoint
     StreamingQuery restartedQuery = streamWriter.start();
