使用反射的方式将RDD转换成DataFrame
java写
//student类
// Plain JavaBean describing one student record. Spark SQL reflects over the
// getters of this class to infer a DataFrame schema, and Jackson uses the
// no-arg constructor plus setters when deserializing JSON lines.
public class student {
    // All three fields are String, matching the original schema inference.
    private String id;
    private String name;
    private String age;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getAge() {
        return age;
    }

    public void setAge(String age) {
        this.age = age;
    }

    @Override
    public String toString() {
        // Produces exactly the same text as the original concatenation-based
        // version: student{id='..', name='..', age='..'}
        return new StringBuilder("student{")
                .append("id='").append(id).append('\'')
                .append(", name='").append(name).append('\'')
                .append(", age='").append(age).append('\'')
                .append('}')
                .toString();
    }
}
//主类
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.codehaus.jackson.map.ObjectMapper;
/**
* 使用反射的方式将RDD转换成DataFrame
*/
/**
 * Converts an RDD to a DataFrame using reflection (Spark 1.x API).
 *
 * Reads a file containing one JSON object per line, maps each line to a
 * {@code student} bean with Jackson, and lets Spark SQL infer the DataFrame
 * schema from the bean class via reflection.
 */
public class Rdd2DataFrameRefection {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("Rdd2DataFrameRefection")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(sc);
            // Each input line is a JSON object, e.g. {"id":1, "name":"leo", "age":18}
            JavaRDD<String> javaRDD = sc.textFile("D:\\eclipse\\wc\\scalaworid\\students(2).json");
            JavaRDD<student> students = javaRDD.map(new Function<String, student>() {
                @Override
                public student call(String s) throws Exception {
                    // NOTE(review): creating one ObjectMapper per record is costly;
                    // consider mapPartitions with a per-partition mapper instead.
                    ObjectMapper objectMapper = new ObjectMapper();
                    return objectMapper.readValue(s, student.class);
                }
            });
            // Convert the RDD to a DataFrame via reflection: passing student.class
            // lets Spark SQL reflect over the bean's getters to derive the schema.
            DataFrame dataFrame = sqlContext.createDataFrame(students, student.class);
            // With a DataFrame in hand, register a temporary table and query it with SQL.
            dataFrame.registerTempTable("student");
            // Select students aged at most 18 (the "teenagers").
            String sql = "select id,name,age from student where age <=18 ";
            DataFrame sql1 = sqlContext.sql(sql);
            sql1.show();
            // Expected output:
            // +---+-----+---+
            // | id| name|age|
            // +---+-----+---+
            // |  1|  leo| 18|
            // |  3|marry| 17|
            // +---+-----+---+
        } finally {
            // Fixed: the SparkContext was never closed, leaking the local cluster
            // resources; always release it, even if the job throws.
            sc.close();
        }
    }
}
scala写
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* 如果要用scala开发spark程序
* 然后在其中,还要实现基于反射的RDD到DataFrame的转换,就必须得用object extends App的方式
* 不能用def main()方法的方式,来运行程序,否则就会报no typetag for ...class的错误
*
* @author Administrator
*/
/**
 * Converts an RDD to a DataFrame using reflection (Spark 1.x API).
 *
 * NOTE: with Scala's reflection-based conversion the program is written as
 * `object ... extends App`; a `def main()` entry point would place the case
 * class inside a method and trigger a "no TypeTag for ...class" error.
 */
object RDD2DataFrameReflection extends App {
  val conf: SparkConf = new SparkConf().setAppName("RDD2DataFrameReflection").setMaster("local")
  val sc = new SparkContext(conf)
  val sQLContext = new SQLContext(sc)
  // The implicit conversions providing `.toDF()` must be imported manually.
  import sQLContext.implicits._

  // Schema source for reflection: one field per CSV column.
  case class student(id: Int, name: String, age: Int)

  // Input lines look like: 1,leo,17
  // Read the file, split each line on commas, map the columns into the case
  // class, and let Spark infer the schema — all in one pipeline.
  val stuDF: DataFrame = sc.textFile("D:\\eclipse\\wc\\scalaworid\\students.txt")
    .map(_.split(",")).map(arr => student(arr(0).toInt, arr(1), arr(2).toInt))
    .toDF()

  // Register a temporary table so the data can be queried with SQL.
  // Fixed: the table was previously registered under the typo name "sthdent".
  stuDF.registerTempTable("student")
  val dataFrame: DataFrame = sQLContext.sql("select id,name,age from student where age <= 18")
  dataFrame.show()
  // Expected output:
  // +---+-----+---+
  // | id| name|age|
  // +---+-----+---+
  // |  1|  leo| 17|
  // |  2|marry| 17|
  // |  3| jack| 18|
  // +---+-----+---+

  // Fixed: the SparkContext was never stopped; release it explicitly.
  sc.stop()
}
以编程方式动态指定元数据,将RDD转换成DataFrame
java写
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.ArrayList;
/**
* 以编程方式动态指定元数据,将RDD转换成DataFrame
*/
/**
 * Converts an RDD to a DataFrame by programmatically specifying the schema
 * (Spark 1.x API): build an RDD of Rows, construct a StructType describing
 * the columns at runtime, and combine the two with createDataFrame.
 */
public class RDD2DataFrameFromBC {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("RDD2DataFrameFromBC");
        final JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(sc);
            // Step 1: build a plain RDD of Rows from the comma-separated input file.
            JavaRDD<String> lines = sc.textFile("D:\\eclipse\\wc\\scalaworid\\students.txt");
            JavaRDD<Row> rowJavaRDD = lines.map(new Function<String, Row>() {
                @Override
                public Row call(String line) throws Exception {
                    String[] split = line.split(",");
                    // All three columns stay as String, matching the StringType
                    // fields declared in the schema below.
                    return RowFactory.create(
                            split[0],
                            split[1],
                            split[2]
                    );
                }
            });
            // Step 2: construct the metadata (schema) dynamically.
            // Field names and types may only be known at runtime — loaded from
            // MySQL or a config file, say — which is exactly when this
            // programmatic style is preferable to reflection.
            ArrayList<StructField> structFields = new ArrayList<StructField>();
            structFields.add(DataTypes.createStructField("id", DataTypes.StringType, true));
            structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
            structFields.add(DataTypes.createStructField("age", DataTypes.StringType, true));
            StructType structType = DataTypes.createStructType(structFields);
            // Step 3: combine the Row RDD with the dynamic schema into a DataFrame.
            DataFrame studentDF = sqlContext.createDataFrame(rowJavaRDD, structType);
            studentDF.registerTempTable("student");
            DataFrame sql = sqlContext.sql("select id,name,age from student where age<= 18");
            sql.show();
            // Expected output:
            // +---+-----+---+
            // | id| name|age|
            // +---+-----+---+
            // |  1|  leo| 17|
            // |  2|marry| 17|
            // |  3| jack| 18|
            // +---+-----+---+
        } finally {
            // Fixed: the SparkContext was never closed; always release it.
            sc.close();
        }
    }
}
scala写
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext, sql}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import scala.language.postfixOps
/**
* 以编程方式动态指定元数据,将RDD转换成DataFrame
*/
/**
 * Converts an RDD to a DataFrame by programmatically specifying the schema
 * (Spark 1.x API): build an RDD[Row], describe the columns with a StructType
 * constructed at runtime, and combine the two with createDataFrame.
 */
object RDD2DataFrameBC {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local").setAppName("RDD2DataFrameBC")
    val sc = new SparkContext(conf)
    try {
      val sQLContext = new SQLContext(sc)
      // Step 1: build an RDD whose elements are Rows (Row must be imported).
      val stuString: RDD[String] = sc.textFile("D:\\eclipse\\wc\\scalaworid\\students.txt")
      val stuRow: RDD[Row] = stuString.map({ line =>
        val s = line.split(",")
        // id and age are parsed to Int to match IntegerType in the schema below.
        Row(s(0).toInt, s(1), s(2).toInt)
      })
      // Step 2: construct the metadata (schema) programmatically — useful when
      // field names/types are only known at runtime.
      val structType = StructType(Array(
        StructField("id", IntegerType, true),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      ))
      val stuDF = sQLContext.createDataFrame(stuRow, structType)
      stuDF.registerTempTable("student")
      val frame: DataFrame = sQLContext.sql("select id,name,age from student where age >= 17")
      frame.show()
      // Expected output:
      // +---+-----+---+
      // | id| name|age|
      // +---+-----+---+
      // |  1|  leo| 17|
      // |  2|marry| 17|
      // |  3| jack| 18|
      // |  4|  tom| 19|
      // +---+-----+---+
    } finally {
      // Fixed: the SparkContext was never stopped; release it even on failure.
      sc.stop()
    }
  }
}