When reading JSON, Spark infers the schema from the data itself: it has to read every row before it can determine the field types.
Two ways to write out a file:
//df2.write.mode(SaveMode.Append).json("out/boy")
df2.write.mode(SaveMode.Append).format("json").save("out2/boy")
Two ways to read a file:
//Infers the data's schema; every row is read, and the schema information is sent back to the Driver
//val df: DataFrame = spark.read.json("data/user.json")
val df: DataFrame = spark.read.format("json").load("data/user.json")
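If the layout is already known, the inference scan can be avoided by passing an explicit schema to the reader. A minimal sketch, assuming the same id/name/age/province/score layout as the user data below:

import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType, StructType}

// With an explicit schema, spark.read does not need to scan the file to infer types
val userSchema = new StructType()
  .add("id", IntegerType)
  .add("name", StringType)
  .add("age", IntegerType)
  .add("province", StringType)
  .add("score", DoubleType)
val dfWithSchema: DataFrame = spark.read.schema(userSchema).json("data/user.json")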
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}

object TXTChangeJSON {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()
    val sc = spark.sparkContext
    val lines = sc.textFile("data\\user.txt")
    // Parse each comma-separated line into a Row
    val rowRDD: RDD[Row] = lines.map(it => {
      val strings = it.split(",")
      val f1 = strings(0).toInt
      val f2 = strings(1)
      val f3 = strings(2).toInt
      val f4 = strings(3)
      val f5 = strings(4).toDouble
      Row(f1, f2, f3, f4, f5)
    })
    val schema: StructType = StructType(List(
      StructField("id", IntegerType),
      StructField("name", StringType),
      StructField("age", IntegerType),
      StructField("province", StringType),
      StructField("score", DoubleType)
    ))
    val df: DataFrame = spark.createDataFrame(rowRDD, schema)
    // Create the DataFrame, then write the txt data out as a JSON file.
    // mode(SaveMode.Overwrite)      deletes the existing output, then rewrites it
    // mode(SaveMode.Append)         appends to the existing output
    // mode(SaveMode.ErrorIfExists)  throws an error if the output already exists
    // mode(SaveMode.Ignore)         the most lenient: if the output already exists, it neither errors nor rewrites/appends
    df.write.mode(SaveMode.Overwrite).json("bigdata\\user.json")
    df.printSchema()
    // Import the implicit conversions, otherwise the $ column syntax is unavailable
    import spark.implicits._
    // Filter and select only the columns we want
    df.filter(row => {
      row.getString(1) != null
    }).select("name", "age").orderBy($"age".desc).show()
    spark.stop()
  }
}
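The same DataFrame can also be built without hand-writing Row and StructType: map each line to a case class and call toDF(), letting Spark derive the schema from the case class fields. A minimal sketch under the same assumptions about user.txt (the User case class and object name are introduced here for illustration):

import org.apache.spark.sql.SparkSession

case class User(id: Int, name: String, age: Int, province: String, score: Double)

object TXTChangeJSONWithCaseClass {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()
    import spark.implicits._
    // toDF() derives the column names and types from the User case class
    val df = spark.sparkContext.textFile("data\\user.txt")
      .map(_.split(","))
      .map(a => User(a(0).toInt, a(1), a(2).toInt, a(3), a(4).toDouble))
      .toDF()
    df.printSchema()
    spark.stop()
  }
}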
JSON sends the schema information back to the Driver; it is the most special format in that it has to read every row of data before the attributes can be determined.
Read the JSON and filter out the dirty (corrupt) records:
import org.apache.spark.sql.SparkSession

object CreateDataFrameFromJSON {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()
    val df = spark.read.json("bigdata\\user.json")
    df.printSchema()
    /* import spark.implicits._
    df.where($"_corrupt_record" isNull).show() */
    // Column 0 is expected to be _corrupt_record here; rows where it is null are the clean ones
    df.filter(it => {
      it.getString(0) == null
    }).select("name", "age").show()
    spark.stop()
  }
}
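Instead of filtering on _corrupt_record afterwards, malformed lines can also be dropped at read time with the JSON reader's mode option. A minimal sketch, assuming the same bigdata\user.json path:

// PERMISSIVE (the default) keeps bad lines and puts their raw text into _corrupt_record,
// DROPMALFORMED discards them, FAILFAST throws on the first malformed line.
val cleanDF = spark.read
  .option("mode", "DROPMALFORMED")
  .json("bigdata\\user.json")
cleanDF.select("name", "age").show()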