-
数据准备
我们要把本地的1.csv文件读到Spark中
-
CsvDemo.scala
/** Reads a local CSV file into Spark, registers it as a temporary view and
  * prints the first 10 rows of two columns.
  *
  * Needs `import org.apache.spark.sql.{DataFrame, SparkSession}` at the top of the file.
  */
object CsvDemo {
  def main(args: Array[String]): Unit = {
    // 1. Create the Spark session (local mode, using all available cores).
    val spark: SparkSession = SparkSession.builder()
      .appName("data")
      .master("local[*]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("warn")

    try {
      // 2. Read the CSV data source.
      val csvDf: DataFrame = spark.read
        .option("header", "true")      // the first line holds the column names
        // Read-side parse mode: malformed rows are kept (bad fields become null)
        // instead of failing the job. This is NOT a write/overwrite setting.
        .option("mode", "PERMISSIVE")
        .option("inferSchema", "true") // infer column types from the data
        .csv("/Users/xiaoyueyue/csdn/data/1.csv") // local file path

      // 3. Register the DataFrame as a temporary view; "OrReplace" avoids an
      //    AnalysisException when the view already exists in a reused session.
      csvDf.createOrReplaceTempView("csv_data")

      // 4. Query a subset of the columns and print the first 10 rows.
      spark.sql("select xingming,zhuanye from csv_data").show(10)
    } finally {
      // 5. Always release the session, even if reading or querying fails.
      spark.stop()
    }
  }
}
-
查询结果截图
-
原始数据截图
-
将查询结果写出为 CSV 文件(注意:输出路径是一个目录,Spark 会在其中生成若干 part 分片文件)
// Select the two columns of interest, then write them out as CSV files
// under the given output directory.
val selectedColumns = spark.sql("select xingming,zhuanye from csv_data")
selectedColumns.write.csv("/Users/xiaoyueyue/Desktop/result")
/**
 * Complete example: reads a local CSV file into Spark, registers it as a
 * temporary view, selects two columns and writes them out as CSV.
 *
 * Needs `import org.apache.spark.sql.{DataFrame, SparkSession}` at the top of the file.
 */
object CsvDemo {
  def main(args: Array[String]): Unit = {
    // 1. Create the Spark session (local mode, using all available cores).
    val spark: SparkSession = SparkSession.builder()
      .appName("data")
      .master("local[*]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("warn")

    try {
      // 2. Read the CSV data source.
      val csvDf: DataFrame = spark.read
        .option("header", "true")      // the first line holds the column names
        // Read-side parse mode: malformed rows are kept (bad fields become null)
        // instead of failing the job. It does not control how output is written.
        .option("mode", "PERMISSIVE")
        .option("inferSchema", "true") // infer column types from the data
        .csv("/Users/xiaoyueyue/csdn/data/1.csv") // local file path

      // 3. Register the DataFrame as a temporary view; "OrReplace" avoids an
      //    AnalysisException when the view already exists in a reused session.
      csvDf.createOrReplaceTempView("csv_data")

      // 4. Query a subset of the columns and write the result as CSV.
      //    The save mode is set on the writer: "overwrite" replaces an existing
      //    output directory, so re-running the demo does not fail with
      //    "path already exists" (the default save mode errors out).
      spark.sql("select xingming,zhuanye from csv_data")
        .write
        .mode("overwrite")
        .csv("/Users/xiaoyueyue/Desktop/result")
    } finally {
      // 5. Always release the session, even if reading or writing fails.
      spark.stop()
    }
  }
}
POM文件
(注意版本兼容问题:Spark 2.2.1 对应 Scala 2.11,各依赖 artifactId 的 _2.11 后缀必须与所用 Scala 版本一致)
<!-- Version properties for this build. -->
<properties>
<!-- Spark release; every org.apache.spark dependency below uses ${spark.version}. -->
<spark.version>2.2.1</spark.version>
<!-- NOTE(review): scala.version is not referenced anywhere in this fragment; the
     artifact IDs below hard-code the _2.11 suffix. Confirm whether it is used elsewhere. -->
<scala.version>2.11</scala.version>
</properties>
<!-- Project dependencies: Scala tooling, Spark modules (all built for Scala 2.11) and log4j. -->
<dependencies>
<!-- NOTE(review): this is the Scala compiler artifact, not scala-library; presumably
     scala-library arrives transitively. Confirm that scala-compiler is really needed. -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-compiler</artifactId>
<version>2.11.8</version>
</dependency>
<!-- Spark core. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Spark SQL. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Spark Streaming. NOTE(review): this is the only Spark module with scope "provided";
     confirm the inconsistency with the other Spark dependencies is intentional. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<!-- Spark Hive integration. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- log4j 1.x logging library. -->
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
</dependencies>
<!-- Build settings: Scala source layout plus Java 1.8 compiler configuration. -->
<build>
<!-- Main and test sources live under the Scala directories. -->
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<!-- NOTE(review): only maven-compiler-plugin is configured in this fragment and no
     Scala compiler plugin is visible; confirm how the .scala sources get compiled
     when building with Maven (as opposed to running from an IDE). -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<!-- Java source/target level and source file encoding. -->
<source>1.8</source>
<target>1.8</target>
<encoding>utf-8</encoding>
</configuration>
</plugin>
</plugins>
</build>
春江花朝秋月夜,往往取酒还独倾