Spark SQL 下 DataFrame 的初步认识（2）

最新推荐文章于 2024-07-31 15:54:40 发布

原创最新推荐文章于 2024-07-31 15:54:40 发布 · 1.2k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#spark #sql #DataFrame #Spark-SQL

Spark梦想专栏收录该内容

30 篇文章

订阅专栏

本文介绍如何使用Java和Scala进行RDD与DataFrame的转换及应用。包括Java中通过反射创建DataFrame并执行SQL查询，以及Scala环境下利用SparkShell执行SparkSQL命令的方法。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1,使用Java和Scala实战RDD和DataFrame

1,Java 方式实战RDD与DataFrame的转换

 import java.util.List;


import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

/**
 * Demonstrates converting between a JavaRDD and a DataFrame using bean reflection.
 *
 * <p>The program reads comma-separated {@code id,name,age} lines from a text file,
 * maps each line to a {@link Person} bean, builds a DataFrame from the bean class,
 * registers it as the temporary table {@code persons}, runs a SQL query against it,
 * converts the result back to an RDD of {@link Person} and prints every element.
 */
public class RDD2DataFrameByReflect {
    /**
     * Entry point of the example.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrameByReflect");
        JavaSparkContext sc = new JavaSparkContext(conf);

        try {
            SQLContext sqlContext = new SQLContext(sc);

            JavaRDD<String> lines = sc.textFile("C://Users//zpf//Desktop//Persons.txt");

            // Parse every "id,name,age" line into a Person bean.
            JavaRDD<Person> persons = lines.map(new Function<String, Person>() {

                @Override
                public Person call(String line) throws Exception {
                    String[] fields = line.split(",");

                    Person p = new Person();
                    p.setId(Integer.valueOf(fields[0].trim()));
                    // Trim the name as well, so it is treated like the numeric fields
                    // and does not keep whitespace that follows the comma delimiter.
                    p.setName(fields[1].trim());
                    p.setAge(Integer.valueOf(fields[2].trim()));
                    return p;
                }
            });

            // The schema is derived by reflection from the Person bean class.
            DataFrame df = sqlContext.createDataFrame(persons, Person.class);

            df.registerTempTable("persons");

            DataFrame bigDatas = sqlContext.sql("select * from persons where age >= 6");

            // Convert the DataFrame back to an RDD so the rows can be printed below.
            JavaRDD<Row> bigDataRDD = bigDatas.javaRDD();

            JavaRDD<Person> result = bigDataRDD.map(new Function<Row, Person>() {

                @Override
                public Person call(Row row) throws Exception {
                    Person p = new Person();
                    // Look the values up by field name instead of by position: the
                    // positional variant only worked because the reflected columns
                    // happened to be ordered alphabetically (age, id, name).
                    p.setId(row.<Integer>getAs("id"));
                    p.setName(row.<String>getAs("name"));
                    p.setAge(row.<Integer>getAs("age"));
                    return p;
                }
            });

            List<Person> personList = result.collect();

            for (Person p : personList) {
                System.out.println(p);
            }
        } finally {
            // Always release the Spark context, even when the job fails.
            sc.stop();
        }
    }
}




import scala.Serializable;

/**
 * Serializable JavaBean describing a person by id, name and age.
 *
 * <p>The getter/setter pairs expose the three properties {@code id},
 * {@code name} and {@code age}.
 */
public class Person implements Serializable{

    private int id;
    private String name;
    private int age;

    // ---- getters ----

    /** @return the person's id */
    public int getId() {
        return this.id;
    }

    /** @return the person's name */
    public String getName() {
        return this.name;
    }

    /** @return the person's age */
    public int getAge() {
        return this.age;
    }

    // ---- setters ----

    /** @param id the id to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @param name the name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @param age the age to store */
    public void setAge(int age) {
        this.age = age;
    }

    /**
     * Renders the bean as {@code People [id=..., name=..., age=...]}.
     *
     * @return the textual representation of this person
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("People [id=");
        text.append(this.id);
        text.append(", name=").append(this.name);
        text.append(", age=").append(this.age);
        text.append("]");
        return text.toString();
    }

}

**注意：JavaRDD 和 DataFrame 互相转换的注意事项：**

1）用于反射的类必须是 public class；
2）该类必须实现 Serializable 接口；
3）DataFrame 转换成 RDD 时，应根据 fieldName 获取对应的值（通过反射生成的 schema 中，字段默认按字母顺序排列）。

2，Scala运行实战

1，下述代码片段展示了可以在Spark Shell终端执行的Spark SQL命令


// Create a SQLContext from the existing SparkContext (`sc` is provided by the Spark shell)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

// Import the implicits that convert an RDD into a DataFrame (this enables .toDF())
import sqlContext.implicits._

// Custom class representing a customer; the schema is derived from its fields by reflection
case class Customer(customer_id: Int, name: String, city: String, state: String, zip_code: String)

// Split each line of the text dataset into its fields.
// split(",", -1) keeps trailing empty fields (a blank zip_code no longer shortens the array),
// and every field is trimmed so values do not keep the whitespace that follows a comma.
val customerFields = sc.textFile("data/customers.txt").map(_.split(",", -1).map(_.trim))

// Build a DataFrame of Customer objects from the parsed fields
val dfCustomers = customerFields.map(p => Customer(p(0).toInt, p(1), p(2), p(3), p(4))).toDF()

// Register the DataFrame as a table
dfCustomers.registerTempTable("customers")

// Show the contents of the DataFrame
dfCustomers.show()

// Print the DataFrame schema
dfCustomers.printSchema()

// Select the customer name column
dfCustomers.select("name").show()

// Select the customer name and city columns
dfCustomers.select("name", "city").show()

// Select a customer by id
dfCustomers.filter(dfCustomers("customer_id").equalTo(500)).show()

// Count the customers per zip code
dfCustomers.groupBy("zip_code").count().show()

在上一示例中，模式是通过反射得到的。我们也可以通过编程的方式指定数据集的模式。当数据的结构以字符串的形式编码、因而无法提前定义定制类时，这种方法非常实用。

如下代码示例展示了如何使用新的数据类型类StructType，StringType和StructField指定模式。

// Specify the schema programmatically


// Create a SQLContext from the existing SparkContext (`sc` is provided by the Spark shell)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

// Create the RDD of raw text lines
val rddCustomers = sc.textFile("data/customers.txt")

// The schema is encoded as a string of space-separated field names
val schemaString = "customer_id name city state zip_code"

// Import the Spark SQL data types and Row
import org.apache.spark.sql._

import org.apache.spark.sql.types._

// Generate the schema object from the schema string; every column is a nullable string
val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true)))

// Convert the records of the RDD (rddCustomers) to Rows.
// split(",", -1) keeps trailing empty fields (a blank zip_code no longer shortens the array),
// and every field is trimmed so values do not keep the whitespace that follows a comma.
val rowRDD = rddCustomers.map(_.split(",", -1).map(_.trim)).map(p => Row(p(0), p(1), p(2), p(3), p(4)))

// Apply the schema to the RDD
val dfCustomers = sqlContext.createDataFrame(rowRDD, schema)

// Register the DataFrame as a table
dfCustomers.registerTempTable("customers")

// Run a SQL statement through the sql method of the sqlContext object
val custNames = sqlContext.sql("SELECT name FROM customers")

// The result of the SQL query is a DataFrame; it is mapped row by row below.
// The columns of a result row are accessed by ordinal position.
custNames.map(t => s"Name: ${t(0)}").collect().foreach(println)

// Run a SQL statement through the sql method of the sqlContext object
val customersByCity = sqlContext.sql("SELECT name,zip_code FROM customers ORDER BY zip_code")

// The result of the SQL query is a DataFrame; it is mapped row by row below.
// The columns of a result row are accessed by ordinal position.
customersByCity.map(t => s"${t(0)},${t(1)}").collect().foreach(println)