package com.lyzx.reviewDay30

import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

class T1 {

  /**
   * Read a text file and save it in Parquet format.
   * @param sc SparkContext
   */
  def f1(sc: SparkContext): Unit = {
    val sqlCtx = new SQLContext(sc)

    // Person.txt contains lines such as:
    // 88,朱德,45
    // 89,毛泽东,46
    // 90,邓小平,56
    val rowRdd = sc.textFile("./Person.txt")
      .map(_.split(","))
      .map(x => Row(x(0).toInt, x(1), x(2).toInt))

    val schema = StructType(Array(
      StructField("id", IntegerType, false),
      StructField("name", StringType, true),
      StructField("age", IntegerType, false)
    ))

    val df = sqlCtx.createDataFrame(rowRdd, schema)
    df.registerTempTable("user")
    df.printSchema()

    val resultDf = sqlCtx.sql("select * from user where age > 45")
    resultDf.foreach(x => {
      val id = x.getAs[Integer]("id")
      val name = x.getAs[String]("name")
      val age = x.getAs[Integer]("age")
      println("id=" + id + ",name=" + name + ",age=" + age)
    })
    resultDf.write.parquet("./Person")
  }

  /**
   * Read a Parquet file.
   * @param sc SparkContext
   */
  def f2(sc: SparkContext): Unit = {
    val sqlCtx = new SQLContext(sc)
    // Read the directory written by f1
    val df = sqlCtx.read.parquet("./Person")
    df.printSchema()
    df.foreach(println)
  }

  /**
   * A simple example.
   * Input lines look like: 2017-11-12,270,11733
   * Compute the total sales per day, with basic data cleansing (drop malformed lines).
   */
  def f3(sc: SparkContext): Unit = {
    val sqlCtx = new SQLContext(sc)
    val data = Array(
      "2017-11-11,200,11333", "2017-11-11,260,11533", "2017-11-11,190,11534",
      "2017-11-12,120,11534", "2017-11-12,890,22343", "2017-11-12,260,11535",
      "2017-11-13,120,11534", "2017-11-13,890,22343", "2017-11-13,260,11535")

    val oData = sc.parallelize(data)
      .filter(x => x.split(",").length == 3) // keep only well-formed lines
      .map(_.split(","))
      .map(x => Row(x(0), x(1).toInt, x(2)))

    val schema = StructType(Array(
      StructField("now_date", StringType, false),
      StructField("sales", IntegerType, false),
      StructField("name", StringType, false)
    ))

    val df = sqlCtx.createDataFrame(oData, schema)
    df.registerTempTable("score")
    val result = sqlCtx.sql("select now_date,sum(sales) from score group by now_date")
    result.show()
  }

  /**
   * UDF: User Defined Function
   */
  def f4(sc: SparkContext): Unit = {
    val sqlCtx = new SQLContext(sc)
    val dataArr = Array("周星驰", "大话西游", "罗密欧与朱丽叶", "水遁*大鲛弹", "无敌是多么,多么..寂寞")
    val dataRDD = sc.parallelize(dataArr).map(Row(_))
    val schema = StructType(Array(StructField("name", StringType, false)))

    val df = sqlCtx.createDataFrame(dataRDD, schema)
    df.registerTempTable("names")

    // Register a UDF that returns the length of a string
    sqlCtx.udf.register("strLen", (str: String) => str.length)
    sqlCtx.sql("select name,strLen(name) as len from names").show()
  }

  /**
   * UDAF: User Defined Aggregate Function
   */
  def f5(sc: SparkContext): Unit = {
    val sqlContext = new SQLContext(sc)
    val names = Array("yarn", "Marry", "Jack", "Tom", "Tom", "Tom", "Tom", "Tom")

    /*
     * Conceptually the aggregation runs in two stages, e.g.:
     *   map task 1: "Tom","Tom","Tom" -> partial result for Tom (update)
     *   map task 2: "Jack"            -> partial result for Jack (update)
     *   reduce stage: partial results with the same key are combined (merge)
     */
    val namesRDD = sc.parallelize(names)
    val namesRowRDD = namesRDD.map { name => Row(name) }
    val structType = StructType(Array(StructField("name", StringType, true)))
    val namesDF = sqlContext.createDataFrame(namesRowRDD, structType)
    namesDF.registerTempTable("names")

    // Register the UDAF defined later in this post (MyUDAF behaves like count)
    sqlContext.udf.register("strCount", new MyUDAF)

    // Use the custom aggregate function
    sqlContext.sql("select name,strCount(name) from names group by name")
      .collect()
      .foreach(println)
  }

  def f6(sc: SparkContext): Unit = {
    val sqlCtx = new SQLContext(sc)
    val data = sc.parallelize(
      Array("A","B","C","A","B","C","A","B","C","D","E","A","B","C","A","B","C","A","B","C"), 2)
      .map(Row(_))

    // Print the content of each partition; materialize the iterator first,
    // because an Iterator can only be traversed once.
    data.mapPartitionsWithIndex((index, itr) => {
      val rows = itr.toList
      println("index=" + index + " " + rows.mkString(" "))
      rows.iterator
    }).collect()

    val schema = StructType(Array(StructField("w", StringType, false)))
    val df = sqlCtx.createDataFrame(data, schema)
    df.registerTempTable("words")

    sqlCtx.udf.register("myCount", new MyUDAF)
    sqlCtx.sql("select w,myCount(w) from words group by w").show()
  }
}

object T1 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("reviewDay30").setMaster("local")
    val sc = new SparkContext(conf)

    val t = new T1
    t.f1(sc)
    // t.f2(sc)
    // t.f3(sc)
    // t.f4(sc)
    // t.f5(sc)
    // t.f6(sc)

    sc.stop()
  }
}
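As a side note, the per-day aggregation that f3 expresses in SQL can also be written with the DataFrame API. The snippet below is only an illustrative sketch: it assumes the same df built in f3, and uses the standard groupBy/agg methods together with the sum function from org.apache.spark.sql.functions.

import org.apache.spark.sql.functions.sum

// Equivalent of "select now_date,sum(sales) from score group by now_date",
// assuming df is the DataFrame created in f3.
df.groupBy("now_date")
  .agg(sum("sales").alias("total_sales"))
  .show()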
package com.lyzx.reviewDay30

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

/**
 * A user-defined aggregate function that implements the same behaviour as count.
 */
class MyUDAF extends UserDefinedAggregateFunction {

  // Type of the function's input parameters
  override def inputSchema: StructType = {
    StructType(Array(StructField("word", StringType, true)))
  }

  // Type of the intermediate result used during aggregation
  override def bufferSchema: StructType = {
    StructType(Array(StructField("count", IntegerType, true)))
  }

  // Type of the final result
  override def dataType: DataType = IntegerType

  // The function always returns the same result for the same input
  override def deterministic: Boolean = true

  // Initial value of the aggregation buffer
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0
  }

  // Works like a map-side combiner: buffer holds the result accumulated so far,
  // input is the current row
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getAs[Integer](0) + 1
    println("update: buffer=" + buffer + ", input=" + input)
  }

  // Works like the reduce-side merge of partial results
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getAs[Integer](0) + buffer2.getAs[Integer](0)
    println("merge: buffer1=" + buffer1 + ", buffer2=" + buffer2)
  }

  // Final result read from the buffer
  override def evaluate(buffer: Row): Any = {
    buffer.getInt(0)
  }
}
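As a quick sanity check, the custom aggregate should return the same values as the built-in count. This is a minimal sketch, assuming the words temp table and the myCount registration from f6 (with the same sqlCtx) are already in place:

// Custom UDAF
sqlCtx.sql("select w, myCount(w) as my_cnt from words group by w").show()
// Built-in count, for comparison; the two result sets should match
sqlCtx.sql("select w, count(w) as cnt from words group by w").show()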
This post shows how to use Apache Spark SQL for data processing and analysis, including reading data from text files and converting it to Parquet format, plus more advanced uses such as data cleansing and aggregation with user-defined functions.