Spark Streaming物品排名

最新推荐文章于 2024-12-30 19:25:17 发布

原创 · 最新推荐文章于 2024-12-30 19:25:17 发布 · 454 阅读

2 ·

CC 4.0 BY-SA版权

Spark Streaming 专栏收录该内容

9 篇文章

订阅专栏

利用 Spark Streaming 与 Spark SQL 实时统计各商品类别中最受欢迎的商品：每隔 10 秒对过去 60 秒内的点击日志做一次滑动窗口统计，输出每个类别下点击次数最多的前五名商品。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Scala:

package cn.spark.study.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.hive.HiveContext

/**
 * Streaming job that prints, every 10 seconds, the five most-clicked products of each
 * category over the preceding 60 seconds.
 *
 * Click logs are read as text lines from a socket. Each line is expected to contain at
 * least three space-separated fields, in the order `user product category`; lines with
 * fewer fields are skipped.
 */
object Top5HotProduct {

  /**
   * Builds the streaming pipeline, starts it and blocks until it terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("Top5HotProduct")
    // Micro-batches are cut once per second.
    val ssc = new StreamingContext(conf, Seconds(1))

    // Built once and reused for every window instead of being re-created on each
    // foreachRDD invocation. It is only used in driver-side code below.
    val hiveContext = new HiveContext(ssc.sparkContext)

    // Schema of the temporary table that is rebuilt for every window.
    val structType = StructType(Array(
      StructField("category", StringType, true),
      StructField("product", StringType, true),
      StructField("click_count", IntegerType, true)))

    val productClickLogsDStream = ssc.socketTextStream("spark1", 7777)

    // Turn each "user product category" line into ((category, product), 1).
    // A tuple key is used rather than a "category_product" string so that names which
    // themselves contain '_' are not corrupted when the key is taken apart again.
    // Lines with fewer than three fields are dropped instead of failing the batch with
    // an ArrayIndexOutOfBoundsException.
    val categoryProductPairsDStream = productClickLogsDStream
      .map(_.split(" "))
      .filter(_.length >= 3)
      .map(fields => ((fields(2), fields(1)), 1))

    // Every 10 seconds, sum the clicks of the last 60 seconds per (category, product).
    val categoryProductCountsDStream = categoryProductPairsDStream.reduceByKeyAndWindow(
      (v1: Int, v2: Int) => v1 + v2,
      Seconds(60),
      Seconds(10))

    categoryProductCountsDStream.foreachRDD(categoryProductCountsRDD => {
      // Flatten to (category, product, count) rows so the window can be queried with SQL.
      val categoryProductCountRowRDD = categoryProductCountsRDD.map {
        case ((category, product), count) => Row(category, product, count)
      }

      val categoryProductCountDF =
        hiveContext.createDataFrame(categoryProductCountRowRDD, structType)

      // Re-registering under the same name replaces the previous window's table.
      categoryProductCountDF.registerTempTable("product_click_log")

      // Rank products inside each category by click count and keep the first five.
      val top5ProductDF = hiveContext.sql(
            "SELECT category,product,click_count "
            + "FROM ("
              + "SELECT "
                + "category,"
                + "product,"
                + "click_count,"
                + "row_number() OVER (PARTITION BY category ORDER BY click_count DESC) rank "
              + "FROM product_click_log"
            + ") tmp "
            + "WHERE rank<=5")

      top5ProductDF.show()
    })

    ssc.start()
    ssc.awaitTermination()
  }

}

Java:

package cn.spark.study.streaming;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

/**
 * 与Spark SQL整合使用，top3热门商品实时统计
 * @author Administrator
 *
 */
public class Top5HotProduct {

	public static void main(String[] args) {
		SparkConf conf = new SparkConf()
				.setMaster("local[2]")
				.setAppName("Top5HotProduct");  
		JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));
		
		JavaReceiverInputDStream<String> productClickLogsDStream = jssc.socketTextStream("spark1", 9999);
		
		JavaPairDStream<String, Integer> categoryProductPairsDStream = productClickLogsDStream
				.mapToPair(new PairFunction<String, String, Integer>() {

					private static final long serialVersionUID = 1L;

					@Override
					public Tuple2<String, Integer> call(String productClickLog)
							throws Exception {
						String[] productClickLogSplited = productClickLog.split(" "); 
						return new Tuple2<String, Integer>(productClickLogSplited[2] + "_" + 
								productClickLogSplited[1], 1);
					}
					
				});
		

		JavaPairDStream<String, Integer> categoryProductCountsDStream = 
				categoryProductPairsDStream.reduceByKeyAndWindow(
						
						new Function2<Integer, Integer, Integer>() {

							private static final long serialVersionUID = 1L;
				
							@Override
							public Integer call(Integer v1, Integer v2) throws Exception {
								return v1 + v2;
							}
							
						}, Durations.seconds(60), Durations.seconds(10));  
		

		categoryProductCountsDStream.foreachRDD(new Function<JavaPairRDD<String,Integer>, Void>() {
			
			private static final long serialVersionUID = 1L;

			@Override
			public Void call(JavaPairRDD<String, Integer> categoryProductCountsRDD) throws Exception {
				// 将该RDD，转换为JavaRDD<Row>的格式
				JavaRDD<Row> categoryProductCountRowRDD = categoryProductCountsRDD.map(
						
						new Function<Tuple2<String,Integer>, Row>() {

							private static final long serialVersionUID = 1L;

							@Override
							public Row call(Tuple2<String, Integer> categoryProductCount)
									throws Exception {
								String category = categoryProductCount._1.split("_")[0];
								String product = categoryProductCount._1.split("_")[1];
								Integer count = categoryProductCount._2;
								return RowFactory.create(category, product, count);   
							}
							
						});
				
				// 然后，执行DataFrame转换
				List<StructField> structFields = new ArrayList<StructField>();
				structFields.add(DataTypes.createStructField("category", DataTypes.StringType, true)); 
				structFields.add(DataTypes.createStructField("product", DataTypes.StringType, true));  
				structFields.add(DataTypes.createStructField("click_count", DataTypes.IntegerType, true));  
				StructType structType = DataTypes.createStructType(structFields);
				
				HiveContext hiveContext = new HiveContext(categoryProductCountsRDD.context());
				
				DataFrame categoryProductCountDF = hiveContext.createDataFrame(
						categoryProductCountRowRDD, structType);
				
				// 将60秒内的每个种类的每个商品的点击次数的数据，注册为一个临时表
				categoryProductCountDF.registerTempTable("product_click_log");  
				
				// 执行SQL语句，针对临时表，统计出来每个种类下，点击次数排名前5的热门商品
				DataFrame top3ProductDF = hiveContext.sql(
						"SELECT category,product,click_count "
						+ "FROM ("
							+ "SELECT "
								+ "category,"
								+ "product,"
								+ "click_count,"
								+ "row_number() OVER (PARTITION BY category ORDER BY click_count DESC) rank "
							+ "FROM product_click_log"  
						+ ") tmp "
						+ "WHERE rank<=5");
				
				top3ProductDF.show();      
				
				return null;
			}
			
		});
		
		jssc.start();
		jssc.awaitTermination();
		jssc.close();
	}
	
}