Spark SQL Practice (Reflection and Programmatic Schema)

This post shows how to convert an RDD into a DataFrame in both Java and Scala, either via reflection or by programmatically specifying the schema at runtime, so the data can then be queried with SQL.


Converting an RDD to a DataFrame using reflection

Java version

// The student JavaBean; Spark's bean-based createDataFrame reflects over its
// getters to build the schema. The class should be public and Serializable
// (Spark's own JavaBean examples implement Serializable, which matters once
// the job runs on a cluster rather than in local mode).
import java.io.Serializable;

public class student implements Serializable {
    private String id;
    private String name;
    private String age;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getAge() {
        return age;
    }

    public void setAge(String age) {
        this.age = age;
    }

    @Override
    public String toString() {
        return "student{" +
                "id='" + id + '\'' +
                ", name='" + name + '\'' +
                ", age='" + age + '\'' +
                '}';
    }
}

//Main class
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.codehaus.jackson.map.ObjectMapper;

/**
 * Convert an RDD to a DataFrame using reflection
 */

public class Rdd2DataFrameReflection {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("Rdd2DataFrameReflection")
                .setMaster("local");

        JavaSparkContext sc = new JavaSparkContext(conf);

        SQLContext sqlContext = new SQLContext(sc);

        //{"id":1, "name":"leo", "age":18}
        JavaRDD<String> javaRDD = sc.textFile("D:\\eclipse\\wc\\scalaworid\\students(2).json");

        JavaRDD<student> students = javaRDD.map(new Function<String, student>() {
            @Override
            public student call(String s) throws Exception {
                ObjectMapper objectMapper = new ObjectMapper();
                student student = objectMapper.readValue(s, student.class);
                return student;
            }
        });
        // Convert the RDD to a DataFrame via reflection.
        // Passing student.class in is itself an application of reflection;
        // under the hood, Spark also reflects over the student class
        // to discover its fields and derive the schema.
        DataFrame dataFrame = sqlContext.createDataFrame(students, student.class);

        // With a DataFrame in hand, register it as a temporary table and query it with SQL
        dataFrame.registerTempTable("student");

        // Run SQL against the temporary table:
        // select students aged 18 or younger, i.e. teenagers
        String sql = "select id,name,age from student where age <= 18";
        DataFrame sql1 = sqlContext.sql(sql);

        sql1.show();
        // Output:
        //+---+-----+---+
        //| id| name|age|
        //+---+-----+---+
        //|  1|  leo| 18|
        //|  3|marry| 17|
        //+---+-----+---+
    }
}
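
As an aside: since the input here is already line-delimited JSON, Spark SQL (1.4+) can infer the schema directly, with no bean class and no Jackson parsing. A minimal sketch of that route, shown in Scala for brevity (the Java equivalent is sqlContext.read().json(...)):

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object JsonSchemaInference {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("JsonSchemaInference").setMaster("local"))
    val sqlContext = new SQLContext(sc)

    // read.json scans the JSON records and infers the schema (age, id, name) itself
    val df = sqlContext.read.json("D:\\eclipse\\wc\\scalaworid\\students(2).json")
    df.registerTempTable("student")
    sqlContext.sql("select id,name,age from student where age <= 18").show()
  }
}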

Scala version

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}


/**
  * If you develop a Spark program in Scala and implement the reflection-based
  * RDD-to-DataFrame conversion inside it, you must structure the program as
  * `object ... extends App` rather than running it through a `def main()` method;
  * otherwise (with the case class defined inside main) you get a
  * "no TypeTag available for ...class" error. A sketch after this example
  * shows a `def main()` alternative.
  *
  * @author Administrator
  */
object RDD2DataFrameReflection extends App {
  val conf: SparkConf = new SparkConf().setAppName("RDD2DataFrameReflection").setMaster("local")

  val sc = new SparkContext(conf)

  val sQLContext = new SQLContext(sc)

  // Reflection-based RDD-to-DataFrame conversion in Scala requires manually importing the implicit conversions
  import sQLContext.implicits._

  case class student(id: Int, name: String, age: Int)

  // Input lines look like: 1,leo,17
//  val stuString: RDD[String] = sc.textFile("D:\\eclipse\\wc\\scalaworid\\students.txt")
//
//  // after the split: Array("1", "leo", "17")
//  val stuArray: RDD[Array[String]] = stuString.map(_.split(","))
//
//  val students: RDD[student] = stuArray.map(arr => student(arr(0).trim.toInt, arr(1), arr(2).trim.toInt))
//
//  val stuDF: DataFrame = students.toDF()

  // The four commented-out lines above, merged into a single chain:
  val stuDF: DataFrame = sc.textFile("D:\\eclipse\\wc\\scalaworid\\students.txt")
    .map(_.split(",")).map(arr => student(arr(0).toInt, arr(1), arr(2).toInt))
    .toDF()



  // Register a temporary table
  stuDF.registerTempTable("student")

  val dataFrame: DataFrame = sQLContext.sql("select id,name,age from student where age <= 18")
  dataFrame.show()

  // Output:
  //+---+-----+---+
  //| id| name|age|
  //+---+-----+---+
  //|  1|  leo| 17|
  //|  2|marry| 17|
  //|  3| jack| 18|
  //+---+-----+---+


}
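
Strictly speaking, the "no TypeTag" error comes from defining the case class inside a method: the compiler cannot generate a TypeTag for a class that is local to main(), whereas `extends App` puts the case class in the object body. A minimal sketch (assuming the same students.txt input) showing that a `def main()` style also works once the case class is moved to the top level:

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

// Defined at the top level, outside any method, so a TypeTag can be generated
case class Student(id: Int, name: String, age: Int)

object RDD2DataFrameReflectionMain {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RDD2DataFrameReflectionMain").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val stuDF: DataFrame = sc.textFile("D:\\eclipse\\wc\\scalaworid\\students.txt")
      .map(_.split(","))
      .map(arr => Student(arr(0).trim.toInt, arr(1), arr(2).trim.toInt))
      .toDF()

    stuDF.registerTempTable("student")
    sqlContext.sql("select id,name,age from student where age <= 18").show()
  }
}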

Converting an RDD to a DataFrame by programmatically specifying the schema

Java version

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;

/**
 * Convert an RDD to a DataFrame by programmatically and dynamically specifying the schema
 */
public class RDD2DataFrameFromBC {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("RDD2DataFrameFromBC");

        final JavaSparkContext sc = new JavaSparkContext(conf);

        SQLContext sqlContext = new SQLContext(sc);

        // Step 1: create a plain RDD and map each line of text to a Row
        JavaRDD<String> lines = sc.textFile("D:\\eclipse\\wc\\scalaworid\\students.txt");

        JavaRDD<Row> rowJavaRDD = lines.map(new Function<String, Row>() {
            @Override
            public Row call(String line) throws Exception {
                String[] split = line.split(",");
                return RowFactory.create(
                        split[0],
                        split[1],
                        split[2]
                );
            }
        });
        // Step 2: build the schema (metadata) dynamically.
        // The field names and types (id, name, ...) may only be known at runtime,
        // loaded from MySQL or from a configuration file rather than fixed in code,
        // which is exactly the case this programmatic approach is suited for
        // (a sketch at the end of this post shows one way to do that).
        ArrayList<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("id",DataTypes.StringType,true));
        structFields.add(DataTypes.createStructField("name",DataTypes.StringType,true));
        structFields.add(DataTypes.createStructField("age",DataTypes.StringType,true));

        StructType structType = DataTypes.createStructType(structFields);

        // Step 3: use the dynamically built schema to turn the Row RDD into a DataFrame
        DataFrame studentDF = sqlContext.createDataFrame(rowJavaRDD, structType);

        studentDF.registerTempTable("student");

        // Note: age is declared StringType above; Spark SQL casts it for the numeric comparison
        DataFrame sql = sqlContext.sql("select id,name,age from student where age <= 18");
        sql.show();
        // Output:
        //+---+-----+---+
        //| id| name|age|
        //+---+-----+---+
        //|  1|  leo| 17|
        //|  2|marry| 17|
        //|  3| jack| 18|
        //+---+-----+---+


    }
}

Scala version

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
/**
  * Convert an RDD to a DataFrame by programmatically and dynamically specifying the schema
  */
object RDD2DataFrameBC  {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local").setAppName("RDD2DataFrameBC")

    val sc = new SparkContext(conf)

    val sQLContext = new SQLContext(sc)

    // Step 1: build an RDD of Row objects
    val stuString: RDD[String] = sc.textFile("D:\\eclipse\\wc\\scalaworid\\students.txt")

    val stuRow: RDD[Row] = stuString.map({ line =>
      val s = line.split(",")
      // Row must be imported manually (org.apache.spark.sql.Row)
      Row(s(0).toInt, s(1), s(2).toInt)
    })

    // Step 2: build the schema programmatically
    val structType = StructType(Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)
    ))

    val stuDF = sQLContext.createDataFrame(stuRow, structType)
    stuDF.registerTempTable("student")

    val frame: DataFrame = sQLContext.sql("select id,name,age from student where age >= 17")

    frame.show()

    // Output:
    //+---+-----+---+
    //| id| name|age|
    //+---+-----+---+
    //|  1|  leo| 17|
    //|  2|marry| 17|
    //|  3| jack| 18|
    //|  4|  tom| 19|
    //+---+-----+---+
  }
}
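
To make the "schema loaded at runtime" point from the comments concrete, here is a minimal sketch that builds the StructType from a configuration string instead of hard-coding it. The schemaFromConfig helper and its "name:type" format are invented for illustration; a real version would read the string from a properties file or a MySQL metadata table:

import org.apache.spark.sql.types.{DataType, IntegerType, StringType, StructField, StructType}

object DynamicSchema {
  // Hypothetical config format: "id:int,name:string,age:int"
  def schemaFromConfig(schemaConfig: String): StructType = {
    val fields = schemaConfig.split(",").map { entry =>
      val Array(name, typeName) = entry.split(":")
      val dataType: DataType = typeName match {
        case "int"    => IntegerType
        case "string" => StringType
        case other    => throw new IllegalArgumentException(s"unsupported type: $other")
      }
      StructField(name, dataType, nullable = true)
    }
    StructType(fields)
  }

  def main(args: Array[String]): Unit = {
    // The resulting StructType can be passed straight to sqlContext.createDataFrame(rowRDD, schema)
    println(schemaFromConfig("id:int,name:string,age:int"))
  }
}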