
Commit b1352b5

kcacademic authored and pivovarit committed
Adding files for the tutorial BAEL-2301 (eugenp#6066)
1 parent a3d6ebe commit b1352b5

4 files changed, +345 -4 lines changed

apache-spark/pom.xml

Lines changed: 64 additions & 4 deletions
@@ -15,16 +15,76 @@
     </parent>
 
     <dependencies>
-        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.10 -->
         <dependency>
             <groupId>org.apache.spark</groupId>
-            <artifactId>spark-core_2.10</artifactId>
+            <artifactId>spark-core_2.11</artifactId>
             <version>${org.apache.spark.spark-core.version}</version>
+            <scope>provided</scope>
         </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_2.11</artifactId>
+            <version>${org.apache.spark.spark-sql.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-streaming_2.11</artifactId>
+            <version>${org.apache.spark.spark-streaming.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
+            <version>${org.apache.spark.spark-streaming-kafka.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>com.datastax.spark</groupId>
+            <artifactId>spark-cassandra-connector_2.11</artifactId>
+            <version>${com.datastax.spark.spark-cassandra-connector.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>com.datastax.spark</groupId>
+            <artifactId>spark-cassandra-connector-java_2.11</artifactId>
+            <version>${com.datastax.spark.spark-cassandra-connector-java.version}</version>
+        </dependency>
     </dependencies>
-
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.2</version>
+                <configuration>
+                    <source>1.8</source>
+                    <target>1.8</target>
+                </configuration>
+            </plugin>
+            <plugin>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <descriptorRefs>
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                    </descriptorRefs>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
     <properties>
-        <org.apache.spark.spark-core.version>2.2.0</org.apache.spark.spark-core.version>
+        <org.apache.spark.spark-core.version>2.3.0</org.apache.spark.spark-core.version>
+        <org.apache.spark.spark-sql.version>2.3.0</org.apache.spark.spark-sql.version>
+        <org.apache.spark.spark-streaming.version>2.3.0</org.apache.spark.spark-streaming.version>
+        <org.apache.spark.spark-streaming-kafka.version>2.3.0</org.apache.spark.spark-streaming-kafka.version>
+        <com.datastax.spark.spark-cassandra-connector.version>2.3.0</com.datastax.spark.spark-cassandra-connector.version>
+        <com.datastax.spark.spark-cassandra-connector-java.version>1.5.2</com.datastax.spark.spark-cassandra-connector-java.version>
     </properties>
 
 </project>
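
Note on the build setup: with the Spark artifacts marked as provided and the assembly plugin bound to the package phase, running mvn package in the apache-spark module produces a jar-with-dependencies artifact that bundles the Kafka and Cassandra connectors while leaving Spark itself to be supplied by whatever runtime launches the job, for example via spark-submit; the exact submit invocation is not part of this commit.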
apache-spark/src/main/java/com/baeldung/data/pipeline/Word.java

Lines changed: 25 additions & 0 deletions
package com.baeldung.data.pipeline;

import java.io.Serializable;

// Serializable bean that the Cassandra connector maps onto the vocabulary.words table.
public class Word implements Serializable {
    private static final long serialVersionUID = 1L;
    private String word;
    private int count;

    Word(String word, int count) {
        this.word = word;
        this.count = count;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }
}
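
Both streaming jobs below write through writerBuilder("vocabulary", "words", ...), so a vocabulary keyspace containing a words table must exist before they start. Here is a minimal setup sketch, not part of this commit: the SchemaSetup class name, the column definitions (word text PRIMARY KEY, count int, mirroring the bean's fields), and the SimpleStrategy replication are assumptions for a local single-node Cassandra.

package com.baeldung.data.pipeline;

import org.apache.spark.SparkConf;

import com.datastax.driver.core.Session;
import com.datastax.spark.connector.cql.CassandraConnector;

// Illustrative helper, not part of the commit: creates the keyspace and table the jobs write to.
public class SchemaSetup {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]")
            .setAppName("SchemaSetup")
            .set("spark.cassandra.connection.host", "127.0.0.1");

        // Borrow a driver session from the connector to run the DDL.
        CassandraConnector connector = CassandraConnector.apply(conf);
        try (Session session = connector.openSession()) {
            // Assumption: single-node local cluster, so SimpleStrategy with one replica.
            session.execute("CREATE KEYSPACE IF NOT EXISTS vocabulary "
                + "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}");
            session.execute("CREATE TABLE IF NOT EXISTS vocabulary.words (word text PRIMARY KEY, count int)");
        }
    }
}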
apache-spark/src/main/java/com/baeldung/data/pipeline/WordCountingApp.java

Lines changed: 116 additions & 0 deletions
package com.baeldung.data.pipeline;

import static com.datastax.spark.connector.japi.CassandraJavaUtil.javaFunctions;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.mapToRow;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;

import scala.Tuple2;

public class WordCountingApp {

    @SuppressWarnings("serial")
    public static void main(String[] args) throws InterruptedException {
        // Silence the chatty Spark and Akka loggers.
        Logger.getLogger("org").setLevel(Level.OFF);
        Logger.getLogger("akka").setLevel(Level.OFF);

        // Kafka consumer configuration for the direct stream.
        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Collection<String> topics = Arrays.asList("messages");

        SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster("local[2]");
        sparkConf.setAppName("WordCountingApp");
        sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");

        // Micro-batches of one second.
        JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(streamingContext,
            LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));

        // Extract the (key, value) pair from each Kafka record.
        JavaPairDStream<String, String> results = messages.mapToPair(new PairFunction<ConsumerRecord<String, String>, String, String>() {
            @Override
            public Tuple2<String, String> call(ConsumerRecord<String, String> record) {
                return new Tuple2<>(record.key(), record.value());
            }
        });

        // Keep only the message payload.
        JavaDStream<String> lines = results.map(new Function<Tuple2<String, String>, String>() {
            @Override
            public String call(Tuple2<String, String> tuple2) {
                return tuple2._2();
            }
        });

        // Split each line into words on whitespace.
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String x) {
                return Arrays.asList(x.split("\\s+")).iterator();
            }
        });

        // Count the occurrences of each word within the batch.
        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) {
                return new Tuple2<>(s, 1);
            }
        }).reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
            }
        });

        // Persist each batch's counts to the vocabulary.words table in Cassandra.
        wordCounts.foreachRDD(new VoidFunction<JavaPairRDD<String, Integer>>() {
            @Override
            public void call(JavaPairRDD<String, Integer> javaRdd) throws Exception {
                Map<String, Integer> wordCountMap = javaRdd.collectAsMap();
                for (String key : wordCountMap.keySet()) {
                    List<Word> words = Arrays.asList(new Word(key, wordCountMap.get(key)));
                    JavaRDD<Word> rdd = streamingContext.sparkContext().parallelize(words);
                    javaFunctions(rdd).writerBuilder("vocabulary", "words", mapToRow(Word.class)).saveToCassandra();
                }
            }
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }
}
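
WordCountingApp subscribes to the messages topic on localhost:9092, but nothing in this commit publishes to it. A minimal test producer using the plain kafka-clients API is sketched below; the MessageProducer class name and the sample sentence are illustrative only.

package com.baeldung.data.pipeline;

import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

// Illustrative helper, not part of the commit: feeds test input to the streaming job.
public class MessageProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");
        props.put("key.serializer", StringSerializer.class.getName());
        props.put("value.serializer", StringSerializer.class.getName());

        // Send one line into the topic the streaming job subscribes to.
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            producer.send(new ProducerRecord<>("messages", "the quick brown fox jumps over the lazy dog"));
        }
    }
}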
apache-spark/src/main/java/com/baeldung/data/pipeline/WordCountingAppWithCheckpoint.java

Lines changed: 140 additions & 0 deletions
package com.baeldung.data.pipeline;

import static com.datastax.spark.connector.japi.CassandraJavaUtil.javaFunctions;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.mapToRow;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.Function3;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.State;
import org.apache.spark.streaming.StateSpec;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaMapWithStateDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;

import scala.Tuple2;

public class WordCountingAppWithCheckpoint {

    public static JavaSparkContext sparkContext;

    @SuppressWarnings("serial")
    public static void main(String[] args) throws InterruptedException {

        // Silence the chatty Spark and Akka loggers.
        Logger.getLogger("org").setLevel(Level.OFF);
        Logger.getLogger("akka").setLevel(Level.OFF);

        // Kafka consumer configuration for the direct stream.
        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Collection<String> topics = Arrays.asList("messages");

        SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster("local[2]");
        sparkConf.setAppName("WordCountingAppWithCheckpoint");
        sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");

        JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        sparkContext = streamingContext.sparkContext();

        // Checkpointing is required for the stateful mapWithState transformation below.
        streamingContext.checkpoint("./.checkpoint");

        JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(streamingContext,
            LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));

        // Extract the (key, value) pair from each Kafka record.
        JavaPairDStream<String, String> results = messages.mapToPair(new PairFunction<ConsumerRecord<String, String>, String, String>() {
            @Override
            public Tuple2<String, String> call(ConsumerRecord<String, String> record) {
                return new Tuple2<>(record.key(), record.value());
            }
        });

        // Keep only the message payload.
        JavaDStream<String> lines = results.map(new Function<Tuple2<String, String>, String>() {
            @Override
            public String call(Tuple2<String, String> tuple2) {
                return tuple2._2();
            }
        });

        // Split each line into words on whitespace.
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String x) {
                return Arrays.asList(x.split("\\s+")).iterator();
            }
        });

        // Count the occurrences of each word within the batch.
        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) {
                return new Tuple2<>(s, 1);
            }
        }).reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
            }
        });

        // Merge each batch's count into the running total kept in Spark's state store.
        Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = (word, one, state) -> {
            int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
            Tuple2<String, Integer> output = new Tuple2<>(word, sum);
            state.update(sum);
            return output;
        };

        JavaPairRDD<String, Integer> initialRDD = JavaPairRDD.fromJavaRDD(sparkContext.emptyRDD());

        JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> cumulativeWordCounts = wordCounts
            .mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

        // Persist the cumulative counts to the vocabulary.words table in Cassandra.
        cumulativeWordCounts.foreachRDD(new VoidFunction<JavaRDD<Tuple2<String, Integer>>>() {
            @Override
            public void call(JavaRDD<Tuple2<String, Integer>> javaRdd) throws Exception {
                List<Tuple2<String, Integer>> wordCountList = javaRdd.collect();
                for (Tuple2<String, Integer> tuple : wordCountList) {
                    List<Word> words = Arrays.asList(new Word(tuple._1, tuple._2));
                    JavaRDD<Word> rdd = sparkContext.parallelize(words);
                    javaFunctions(rdd).writerBuilder("vocabulary", "words", mapToRow(Word.class)).saveToCassandra();
                }
            }
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }
}
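
To verify what either job persisted, the connector can read the table back through the same CassandraJavaUtil entry point. The following is a sketch under the same local connection settings, not part of this commit; the WordCountReader class name is assumed, and raw CassandraRow accessors are used so no extra row-to-bean mapping is needed.

package com.baeldung.data.pipeline;

import static com.datastax.spark.connector.japi.CassandraJavaUtil.javaFunctions;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

import com.datastax.spark.connector.japi.CassandraRow;

// Illustrative helper, not part of the commit: prints the stored word counts.
public class WordCountReader {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]")
            .setAppName("WordCountReader")
            .set("spark.cassandra.connection.host", "127.0.0.1");
        JavaSparkContext sparkContext = new JavaSparkContext(conf);

        // Pull the whole table locally and print each word with its count.
        for (CassandraRow row : javaFunctions(sparkContext).cassandraTable("vocabulary", "words").collect()) {
            System.out.println(row.getString("word") + ": " + row.getInt("count"));
        }
        sparkContext.stop();
    }
}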
