Spark Knowledge Summary (Part 1)
Loading the database
The experiment database can be downloaded from: https://download.youkuaiyun.com/download/hiscoming/35151985
Viewing the databases in Hive:
> show databases;
> use zhouwf;
> show tables;
Importing the priors table, as an example:
> create table zhouwf.priors(
> order_id string,
> product_id string,
> add_to_cart_order string,
> reordered string
> )
> row format delimited fields terminated by ','
> lines terminated by '\n';
-- load the local data file into the table
> load data local inpath '/home/zhouwf/Documents/data/order_products__prior.csv'
> overwrite into table priors;
Configuring Spark to read Hive data
To read Hive data from Spark, the following configuration files need to be copied into the Spark installation:
cp hive-site.xml /usr/local/src/spark-2.4.4-bin-hadoop2.6/conf/hive-site.xml
cp mysql-connector-java-5.1.46-bin.jar /usr/local/src/spark-2.4.4-bin-hadoop2.6/jars
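After copying these two files, spark-shell can query Hive directly (the sql(...) calls below are typed inside spark-shell). For a standalone application the session has to enable Hive support explicitly; a minimal sketch, where the app name and the test queries are only placeholders:

import org.apache.spark.sql.SparkSession

// Build a SparkSession that talks to the Hive metastore described in hive-site.xml.
val spark = SparkSession.builder()
  .appName("hive-read-demo")   // placeholder application name
  .enableHiveSupport()
  .getOrCreate()

// Sanity check: the Hive databases and the experiment tables should be visible.
spark.sql("show databases").show()
spark.sql("select * from zhouwf.orders limit 3").show()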
Hands-on project requirements
1. Average interval between each user's purchase orders
scala> val orders = sql("select * from zhouwf.orders")
21/10/28 16:32:36 WARN metastore.ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
orders: org.apache.spark.sql.DataFrame = [order_id: string, user_id: string ... 5 more fields]
scala> val ordersNew = orders.selectExpr("*", "if(days_since_prior_order = '',0.0,days_since_prior_order) as dspo").drop("days_since_prior_order")
ordersNew: org.apache.spark.sql.DataFrame = [order_id: string, user_id: string ... 5 more fields]
scala> ordersNew.show(3)
+--------+-------+--------+------------+---------+-----------------+--------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day| dspo|
+--------+-------+--------+------------+---------+-----------------+--------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_...|
| 2539329| 1| prior| 1| 2| 08| 0.0|
| 2398795| 1| prior| 2| 3| 07| 15.0|
+--------+-------+--------+------------+---------+-----------------+--------------------+
only showing top 3 rows
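Note that the first row shown above is actually the CSV header (order_id, user_id, ...), which was loaded into the Hive table as an ordinary data row. If it should not be counted in the statistics, one way to drop it on the Spark side is a simple filter; a sketch (the name ordersClean is only illustrative):

// Remove the stray header row that came from the CSV file.
val ordersClean = ordersNew.filter("order_id <> 'order_id'")
ordersClean.show(3)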
scala> val userGap = ordersNew.selectExpr("user_id","cast(dspo as int) as dspo").groupBy("user_id").avg("dspo")
userGap: org.apache.spark.sql.DataFrame = [user_id: string, avg(dspo): double]
scala> userGap.show(5)
+-------+------------------+
|user_id| avg(dspo)|
+-------+------------------+
| 296| 5.428571428571429|
| 467| 8.833333333333334|
| 675| 20.0|
| 691|13.173913043478262|
| 829| 9.25|
+-------+------------------+
only showing top 5 rows
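For reference, the same empty-string handling can also be written with the DataFrame API instead of selectExpr; a sketch using when/otherwise (the alias avg_order_gap is only illustrative):

import org.apache.spark.sql.functions.{when, col, lit, avg}

// if(days_since_prior_order = '', 0.0, days_since_prior_order) expressed with when/otherwise.
val userGap2 = orders
  .withColumn("dspo",
    when(col("days_since_prior_order") === "", lit(0.0))
      .otherwise(col("days_since_prior_order").cast("double")))
  .groupBy("user_id")
  .agg(avg("dspo").as("avg_order_gap"))
userGap2.show(5)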
2. Total number of orders per user (group by)
scala> val userTotalOrd = orders.groupBy("user_id").count()
userTotalOrd: org.apache.spark.sql.DataFrame = [user_id: string, count: bigint]
scala> userTotalOrd.show(5)
+-------+-----+
|user_id|count|
+-------+-----+
| 296| 7|
| 467| 6|
| 675| 11|
| 691| 23|
| 829| 4|
+-------+-----+
only showing top 5 rows
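The default column name count can be given a clearer alias through agg; a sketch (order_cnt is only an illustrative name):

import org.apache.spark.sql.functions.count

// Same aggregation as above, with an explicit column name.
val userTotalOrd2 = orders.groupBy("user_id").agg(count("order_id").as("order_cnt"))
userTotalOrd2.show(5)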
3. The deduplicated set of products purchased by each user
scala> val priors = sql("select * from zhouwf.priors")
priors: org.apache.spark.sql.DataFrame = [order_id: string, product_id: string ... 2 more fields]
scala> val oPDF = orders.join(priors,"order_id")
oPDF: org.apache.spark.sql.DataFrame = [order_id: string, user_id: string ... 8 more fields]
scala> val oP = oPDF.selectExpr("user_id","product_id")
oP: org.apache.spark.sql.DataFrame = [user_id: string, product_id: string]
scala> import spark.implicits._
import spark.implicits._
scala> val rddRecords = oP.rdd.map{x=>(x(0).toString, x(1).toString)}.groupByKey().mapValues(record=>record.toSet.mkString(","))
rddRecords: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[66] at mapValues at <console>:28
scala> rddRecords.take(5)
21/10/27 20:39:09 WARN executor.Executor: Managed memory leak detected; size = 5244782 bytes, TID = 214
res6: Array[(String, String)] = Array((124168,14303,22108,46522,20082,11323), (120928,8569,30252,47209,18897,19478,36695,39928,4799,13870,39275,45948,26324,27845,42265,10132,32931,45763,20995,7175,15902,6457,34991,41665,1185,5194,34448,29686,31433,30440,4605,39877,27966,18441,24799,39561,14633,30750,13629,25824,25072,2452,34270), (186692,44172,11759,11365,13176,4557,12078,6087,14678,5384,14161,18615,33720,6184,37710,21573,11266,48697,14032,4913,17634,35336,130,21572,4920), (43535,30787,12896,34122,46785,17794,47589,32538,10193,23691,13187,48205,7631,30183,15054,32971,44570,10554,6789,23375,9969,48071,34862,32747,25783,29515,19333,5077,6187,36030,33129,35750,325,45539,19505,12099,26668,29388,36023,8536,42580,9544,31743,10620,38763,33508,35163,14602,10492,34808,10447,42563,44177,4562,7175...
scala>
scala> rddRecords.toDF("user_id","product_id").show(5)
21/10/27 20:40:16 WARN executor.Executor: Managed memory leak detected; size = 5244782 bytes, TID = 215
+-------+--------------------+
|user_id| product_id|
+-------+--------------------+
| 124168|14303,22108,46522...|
| 120928|8569,30252,47209,...|
| 186692|44172,11759,11365...|
| 43535|30787,12896,34122...|
| 8965|1446,42205,25466,...|
+-------+--------------------+
only showing top 5 rows
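The same result can be obtained without dropping down to the RDD API and its groupByKey shuffle; a sketch using collect_set and concat_ws (the name userProducts is only illustrative):

import org.apache.spark.sql.functions.{collect_set, concat_ws}

// Deduplicate the products per user, then join the set into one comma-separated string.
val userProducts = oP.groupBy("user_id")
  .agg(concat_ws(",", collect_set("product_id")).as("product_id"))
userProducts.show(5)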
4. Total number of products per user and the deduplicated product count (distinct count)
scala> val userGroup = oP.groupBy("user_id").agg(size(collect_set("product_id") as "prod_dist_size"),collect_set("product_id") as "prod_records")
userGroup: org.apache.spark.sql.DataFrame = [user_id: string, size(collect_set(product_id) AS `prod_dist_size`): int ... 1 more field]
scala> userGroup.show(5)
+-------+-------------------------------------------------+--------------------+
|user_id|size(collect_set(product_id) AS `prod_dist_size`)| prod_records|
+-------+-------------------------------------------------+--------------------+
| 100010| 119|[7751, 27360, 247...|
| 100140| 28|[7021, 27845, 436...|
| 100227| 70|[24834, 45007, 27...|
| 100263| 38|[5157, 38928, 372...|
| 100320| 121|[39891, 34358, 17...|
+-------+-------------------------------------------------+--------------------+
only showing top 5 rows
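Note that the aggregation above only produces the distinct count, and the alias lands on the inner collect_set rather than on size, which is why the column name comes out so long. To get both the total and the deduplicated product count per user, as the task title asks, one option is count plus countDistinct; a sketch (the aliases are only illustrative):

import org.apache.spark.sql.functions.{count, countDistinct}

// Total products bought per user and the number of distinct products.
val userProdCnt = oP.groupBy("user_id")
  .agg(count("product_id").as("prod_total_cnt"),
       countDistinct("product_id").as("prod_dist_cnt"))
userProdCnt.show(5)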
5. Average number of products per order for each user
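The command below references ordProCnt, which the transcript never defines; presumably it is the number of products in each order, computed from priors. A minimal sketch of that assumption:

// Number of products per order, derived from the priors table (assumed definition).
val ordProCnt = priors.groupBy("order_id").count()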
scala> val userPerOrdPro = orders.join(ordProCnt,"order_id").groupBy("user_id").agg(avg("count").as("avg_per_order_pro"))
userPerOrdPro: org.apache.spark.sql.DataFrame = [user_id: string, avg_per_order_pro: double]
scala> userPerOrdPro.show(5)
+-------+-----------------+
|user_id|avg_per_order_pro|
+-------+-----------------+
| 88447| 7.4|
| 144913| 5.5|
| 145079| 8.5|
| 13282| 6.1875|
| 124057| 20.0|
+-------+-----------------+
only showing top 5 rows