1. Converting text to a DataFrame
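All snippets below assume an existing SparkSession bound to a value named spark, plus the usual Spark SQL imports; a minimal setup sketch (the appName and master values are placeholders, not part of the original notes):
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession, functions}
import org.apache.spark.sql.functions.{asc, count, max, min}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Hypothetical session setup; adjust appName/master to the actual environment
val spark: SparkSession = SparkSession.builder()
  .appName("dataframe-notes")
  .master("local[*]")
  .getOrCreate()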
// Spark implicit conversions are required for toDF
import spark.implicits._
import scala.io.Source
def textData2DataFrame(): Unit ={
val text:List[(String, String, String)]=Source.fromFile("C:\\Users\\91BGJK2\\Desktop\\天府新区.txt","utf-8").getLines().map{ line=>
val strings = line.split("\t")
(strings(0),strings(1),strings(2)) // build a tuple per line to enable the conversion
}.toList
//spark.implicits._ must be in scope before toDF can be used
val testDataFrame = text.toDF("id","name_patient","birthday")
testDataFrame.select("name_patient").show(10)
//Method 2: build the schema explicitly
//Two ways to obtain an RDD from the text file; textRdd2 (textFile) is the one used below
val textRdd:RDD[Row]=spark.read.text("file:///C:\\Users\\91BGJK2\\Desktop\\天府新区.txt").rdd
val textRdd2:RDD[String]=spark.sparkContext.textFile("file:///C:\\Users\\91BGJK2\\Desktop\\天府新区.txt")
val rowRDD:RDD[Row] = textRdd2.map(_.split("\t")).map(p => Row(p: _*)) // one Row per line
val colNames=List("id","name_patient","birthday")
val schema:StructType = StructType(colNames.map(fieldName => StructField(fieldName,StringType)))//treat every column as StringType
// Combine the rows and the schema into a DataFrame
val data = spark.createDataFrame(rowRDD, schema)
data.select("name_patient").show(10)
//Method 3: use the built-in CSV reader
val text2:DataFrame=spark.read.format("csv")
.option("header", true)
.option("delimiter", "\t")//specify the field delimiter
.load("file:///C:\\Users\\91BGJK2\\Desktop\\天府新区.txt")
text2.select("name_patient").show(10)
}
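For reference, the code above assumes 天府新区.txt is a tab-separated file with three columns matching the toDF call; a hypothetical layout (the values below are placeholders, not real data):
// id <TAB> name_patient <TAB> birthday
// 1    张三    1990-01-01
// 2    李四    1985-07-23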
2. Common DataFrame functions
val t_kc21k1=spark.read.format("jdbc")
.options(properties).option("dbtable","t_kc21k1").load()
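The properties value passed to .options(...) is not shown in these notes; a minimal sketch of what such a JDBC option map might look like (url, user, password and driver are placeholders):
// Hypothetical JDBC connection options; adjust to the actual database
val properties: Map[String, String] = Map(
  "url"      -> "jdbc:mysql://localhost:3306/test",
  "user"     -> "root",
  "password" -> "******",
  "driver"   -> "com.mysql.jdbc.Driver"
)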
--where
t_kc21k1.where($"fprn" === "1" && $"ftimes" === 2).show()
--order by
t_kc21k1.sort(asc("id")).show(10)
--group by
//groupBy only has an effect when combined with an aggregation function
t_kc21k1.groupBy("code","name").count().where($"count"===591).show()
//After groupBy, each group is aggregated; typical aggregations include max, min, count, etc. The result contains the grouping columns plus the columns produced in agg
t_kc21k1.groupBy("id_depa").agg(min("money_sum")as "minFee",max("money_sum") as "maxFee" ,count("ack190") as "timeNums").show(10)
--join
val rw_mapping = spark.read.format("jdbc")
.options(properties).option("dbtable", "rw_mapping").load()
//Pass the join key as Seq() so the duplicate join column is dropped from the result
t_kc21k1.join(rw_mapping, Seq("id_drg"), "inner").select("id_drg","rw").show(10)
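For contrast, a join written with a column expression keeps id_drg from both sides, so the columns must be disambiguated by their source DataFrame; a sketch:
// Expression join: the result contains id_drg from both t_kc21k1 and rw_mapping
t_kc21k1.join(rw_mapping, t_kc21k1("id_drg") === rw_mapping("id_drg"), "inner")
.select(t_kc21k1("id_drg"), rw_mapping("rw")).show(10)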
--filter
t_kc21k1.filter($"id_drg"==="914").join(rw_mapping,Seq("id_drg","id_drg"),"inner").select("id_drg","rw").show(10)
3. Splitting a DataFrame column into multiple rows
val t_kc21k1=spark.read.format("jdbc")
.options(properties).option("dbtable","t_kc21k1").load()
//Method 1: split one column into multiple rows using RDD flatMap + toDF
val sdxFrame = t_kc21k1.select("zdbm","zdmc").rdd.filter(row => row.get(0) != null).flatMap(row => delemiterName(row.getAs[String]("zdbm"), row.getAs[String]("zdmc"))).toDF("code","name")
sdxFrame.write.mode(SaveMode.Overwrite)
.format("jdbc")
.options(properties)
.option("dbtable","t_zd_statistic")
.save()
def delemiterName(code:String,name:String) ={
val codes = code.split("\\|")
val names=name.split("\\|")
val tuples:Array[(String, String)] = codes.zip(names)
tuples
}
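Note that zip truncates to the shorter of the two arrays; if zdbm and zdmc may split into different numbers of parts and the missing side should be padded rather than dropped, zipAll is one option (a sketch, padding with empty strings):
// Variant that pads the shorter side with "" instead of silently dropping entries
def delemiterNamePadded(code: String, name: String): Array[(String, String)] = {
  val codes = code.split("\\|")
  val names = name.split("\\|")
  codes.zipAll(names, "", "")
}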
---The RDD conversion can also be skipped by operating on the DataFrame directly and then reassigning the column names
t_kc21k1.select("id","ssmc").where($"id"===137).flatMap(str=>delemiterName(str.getAs[String]("ssmc"),str.getAs[String]("ssmc")))
.toDF("ssmc1","ssmc2")
.show(10)
//Method 2: use the explode function in Spark SQL together with a user-defined function (UDF)
//Register the DataFrame as a temp view so it can be queried from spark.sql, then register the UDF
t_kc21k1.createOrReplaceTempView("t_kc21k1")
spark.udf.register("testArr",(x:String)=>testArr(x))
spark.sql("select id,explode(testArr(ssmc)) from t_kc21k1 where ssbm is not null").show(10)
def testArr(code:String)={
val res=code.split("\\|")
res
}
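The same split-and-explode can also be written without SQL by wrapping the split in functions.udf and applying it directly on the DataFrame; a sketch (no temp view needed):
// DataFrame-API equivalent of the SQL above
val splitUdf = functions.udf((s: String) => s.split("\\|"))
t_kc21k1.where($"ssbm".isNotNull)
.select($"id", functions.explode(splitUdf($"ssmc")) as "ssmc_new")
.show(10)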
//Method 3: use functions.explode() on the DataFrame directly
t_kc21k1.select("id","ssmc").where($"id"===137).explode( "ssmc" , "ssmc_new" ){time: String => time.split("\\|")}.show(10)
//Note: DataFrame.explode has been deprecated; use functions.explode() instead.
//"Dataset and DataFrame API explode has been deprecated, alternatively, use functions.explode() with select or flatMap."
//The prerequisite is that the column is of Array or Map type; if it is not (e.g. it is a delimited String), convert it first, i.e. split it into an Array and then explode.
t_kc21k1.select("id","ssmc").where($"id"===137).select(functions.split($"ssmc","\\|") as "ssmc")
.select(functions.explode($"ssmc")).show(10)
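If the id column should be kept next to the exploded values, the split and explode can be combined in a single select; a sketch:
// Keep id and emit one row per |-separated ssmc value
t_kc21k1.where($"id" === 137)
.select($"id", functions.explode(functions.split($"ssmc", "\\|")) as "ssmc_new")
.show(10)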
---Filtering null values in a DataFrame with filter
t_kc21k1.filter("ssmc is not null").show(10)
t_kc21k1.filter($"ssmc" isNull).show(10)
4. Splitting a DataFrame column into multiple columns
t_kc21k1.select("id","ssmc").where($"id"===5697).select(functions.split($"ssmc","\\|") as "ssmc")
.withColumn("ssmc1",$"ssmc".getItem(0))
.withColumn("ssmc2",$"ssmc".getItem(1))
.withColumn("ssmc3",$"ssmc".getItem(2))
.withColumn("ssmc4",$"ssmc".getItem(3))
.withColumn("ssmc5",$"ssmc".getItem(4))
.withColumn("ssmc6",$"ssmc".getItem(5))
.withColumn("ssmc7",$"ssmc".getItem(6))
.show()
//If you know the maximum number of values the field can be split into, you can name each output column up front with withColumn; positions with no value are filled with null by default.
Here a record is known to contain at most 7 surgeries, so 7 columns are declared in advance.
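Since the maximum column count is known (7 here), the repeated withColumn calls can also be generated with a fold instead of being written by hand; a sketch:
// Build ssmc1..ssmc7 in a loop; missing positions become null automatically
val splitDf = t_kc21k1.select("id","ssmc").where($"id"===5697).select(functions.split($"ssmc","\\|") as "ssmc")
val wideDf = (0 until 7).foldLeft(splitDf) { (df, i) =>
  df.withColumn(s"ssmc${i + 1}", $"ssmc".getItem(i))
}
wideDf.show()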