Spark Streaming(4):Spark updateStateByKey in Java
博客专区 > Joe_Wu 的博客 > 博客详情
Spark Streaming(4):Spark updateStateByKey in Java
Joe_Wu 发表于 9 个月前
Spark Streaming(4):Spark updateStateByKey in Java
  • 发表于 9个月前
  • 阅读 13
  • 收藏 0
  • 点赞 0
  • 评论 0

移动开发云端新模式探索实践 >>>   

package com.pyrrha.examples;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import kafka.serializer.StringDecoder;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

// NOTE(review): class name should be UpperCamelCase (e.g. UpdateStateByKeyDemo);
// kept as-is so existing run configurations / spark-submit commands keep working.
public class updateStateByKey {

	private static final String KAFKA_TOPIC = "TopicA";

	/**
	 * Word-count over a Kafka direct stream with running per-key state.
	 *
	 * <p>Pipeline: read lines from Kafka topic {@code TopicA}, split each line on
	 * spaces, drop the word "a", count words inside a 6-second tumbling window,
	 * then fold each window's counts into a cumulative per-word total via
	 * {@code updateStateByKey} and print it.
	 *
	 * @param args command-line arguments (unused)
	 * @throws Exception if the streaming context fails to start or is interrupted
	 */
	public static void main(String[] args) throws Exception {
		// Windows-only workaround so Hadoop's winutils.exe can be located.
		System.setProperty("hadoop.home.dir", "D:\\checkpoint\\hadoop-common-2.2.0-bin-master");
		SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("WordsCount");
		// 2-second batch interval.
		JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.milliseconds(2000));
		// updateStateByKey requires checkpointing; configure the directory up front,
		// before any stateful transformation is declared (it was previously set
		// just before start(), which works but is easy to get wrong when refactoring).
		jssc.checkpoint("file:///D:/checkpoint/");

		Map<String, String> kafkaParams = new HashMap<>();
		kafkaParams.put("bootstrap.servers", "127.0.0.1:9092");
		kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
		kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
		kafkaParams.put("group.id", "lingroup");
		//kafkaParams.put("auto.offset.reset", "latest");

		Set<String> topics = new HashSet<>();
		topics.add(KAFKA_TOPIC);

		JavaPairInputDStream<String, String> stream = org.apache.spark.streaming.kafka.KafkaUtils.
			createDirectStream(jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);

		JavaPairDStream<String, Integer> transDStream = stream.flatMap(new FlatMapFunction<Tuple2<String,String>, String>() {
			// Tuple2._1 is the Kafka message key (ignored); _2 is the message payload.
			public Iterator<String> call(Tuple2<String, String> t) throws Exception {
				return Arrays.asList(t._2.split(" ")).iterator();
			}
		}).filter(new Function<String, Boolean>() {
			// Keep every word except the literal "a".
			public Boolean call(String v1) throws Exception {
				return !v1.equals("a");
			}
		}).mapToPair(new PairFunction<String, String, Integer>() {
			public Tuple2<String, Integer> call(String t) throws Exception {
				return new Tuple2<String, Integer>(t, 1);
			}
		}).reduceByKey(new Function2<Integer, Integer, Integer>() {
			public Integer call(Integer v1, Integer v2) throws Exception {
				return v1 + v2;
			}
		// Tumbling window: 6-second length, 6-second slide (no overlap).
		}).window(Durations.seconds(6), Durations.seconds(6));

		/**
		 * v1 is the list of values that arrived for this key in the current batch
		 * (e.g. if transDStream = ["b":2, "c":1], then for "b" v1 = [2]);
		 * v2 is the state saved for this key by the previous updateStateByKey call.
		 * In short: v1 is the latest batch's values, v2 is the cached running total.
		 */
		transDStream.updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
			public Optional<Integer> call(List<Integer> v1, Optional<Integer> v2) throws Exception {
				// Start from the previous total (0 if this key has no state yet),
				// then add every count from the current batch.
				Integer v3 = 0;
				if (v2.isPresent())
					v3 = v2.get();
				for (Integer v : v1)
					v3 += v;

				return Optional.of(v3);
			}
		}).print();

		jssc.start();
		jssc.awaitTermination();
	}

}

 

标签: Spark Streaming
  • 打赏
  • 点赞
  • 收藏
  • 分享
共有 人打赏支持
粉丝 1
博文 9
码字总数 6567
×
Joe_Wu
如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!
* 金额(元)
¥1 ¥5 ¥10 ¥20 其他金额
打赏人
留言
* 支付类型
微信扫码支付
打赏金额:
已支付成功
打赏金额: