Lesson 67: Study Notes on a Comprehensive Hands-on SparkSQL Case
1 SparkSQL case analysis
2 Implementing the case in Java and Scala
This lesson goes straight into hands-on practice of the Join operation in SparkSQL:
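The input file peoples.json used below is not reproduced in these notes. Judging from the code, it should contain one JSON object per line with a name field and a score field; a hypothetical sample (the score values here are made up purely for illustration) might look like this:
{"name":"Michael", "score":98}
{"name":"Andy", "score":95}
{"name":"Justin", "score":68}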
First, the code written in Java:
package SparkSQLByJava;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
public class SparkSQLwithJoin {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkSQLwithJoin");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        // Create a DataFrame from the JSON file data source
        DataFrame peoplesDF = sqlContext.read().json("D:\\DT-IMF\\testdata\\peoples.json");
        // Register the JSON-based DataFrame as a temporary table
        peoplesDF.registerTempTable("peopleScores");
        // Query the people whose score is greater than 90
        DataFrame excellentScoresDF = sqlContext.sql("select name,score from peopleScores where score >90");
        /**
         * Convert the DataFrame into an RDD and use a map operation to collect
         * the names of everyone whose score is greater than 90.
         */
        List<String> excellentScoresNameList = excellentScoresDF.javaRDD().map(new Function<Row, String>() {
            @Override
            public String call(Row row) throws Exception {
                return row.getAs("name");
            }
        }).collect();
        // Dynamically assemble the JSON strings
        List<String> peopleInformations = new ArrayList<String>();
        peopleInformations.add("{\"name\":\"Michael\", \"age\":20}");
        peopleInformations.add("{\"name\":\"Andy\", \"age\":17}");
        peopleInformations.add("{\"name\":\"Justin\", \"age\":19}");
        // Construct a DataFrame from an RDD whose elements are JSON strings
        JavaRDD<String> peopleInformationsRDD = sc.parallelize(peopleInformations);
        DataFrame peopleInformationsDF = sqlContext.read().json(peopleInformationsRDD);
        // Register it as a temporary table
        peopleInformationsDF.registerTempTable("peopleInformations");
        String sqlText = "select name, age from peopleInformations where name in (";
        for (int i = 0; i < excellentScoresNameList.size(); i++) {
            sqlText += "'" + excellentScoresNameList.get(i) + "'";
            if (i < excellentScoresNameList.size() - 1) {
                sqlText += ",";
            }
        }
        sqlText += ")";
        DataFrame excellentNameAgeDF = sqlContext.sql(sqlText);
        JavaPairRDD<String, Tuple2<Integer, Integer>> resultRDD = excellentScoresDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, Integer> call(Row row) throws Exception {
                return new Tuple2<String, Integer>(row.getAs("name"), (int) row.getLong(1));
            }
        }).join(excellentNameAgeDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, Integer> call(Row row) throws Exception {
                return new Tuple2<String, Integer>(row.getAs("name"), (int) row.getLong(1));
            }
        }));
        JavaRDD<Row> resultRowRDD = resultRDD.map(new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {
            @Override
            public Row call(Tuple2<String, Tuple2<Integer, Integer>> tuple) throws Exception {
                return RowFactory.create(tuple._1, tuple._2._2, tuple._2._1);
            }
        });
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
        // Build the StructType that describes the metadata of the final DataFrame
        StructType structType = DataTypes.createStructType(structFields);
        DataFrame personsDF = sqlContext.createDataFrame(resultRowRDD, structType);
        personsDF.show();
        personsDF.write().format("json").save("D:\\DT-IMF\\testdata\\peopleresult");
    }
}
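As a side note, the same (name, age, score) result could in principle be obtained with a single SQL join over the two temporary tables, without converting back to RDDs; the RDD-based join above is mainly there to practice moving between DataFrame and RDD. A minimal sketch under that assumption (not part of the lesson code; the column types would follow the JSON-inferred schema, e.g. bigint, rather than the IntegerType schema built explicitly above):
// Join the two temp tables directly in SQL instead of via RDD pair joins
DataFrame joinedDF = sqlContext.sql(
        "select s.name, i.age, s.score from peopleScores s "
        + "join peopleInformations i on s.name = i.name where s.score > 90");
joinedDF.show();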
Console output when running in Eclipse:
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
16/04/08 00:01:27 INFO SparkContext: Running Spark version 1.6.0
16/04/08 00:01:29 INFO SecurityManager: Changing view acls to: think
16/04/08 00:01:29 INFO SecurityManager: Changing modify acls to: think
16/04/08 00:01:29 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(think); users with modify permissions: Set(think)
16/04/08 00:01:32 INFO Utils: Successfully started service 'sparkDriver' on port 52189.
16/04/08 00:01:33 INFO Slf4jLogger: Slf4jLogger started
16/04/08 00:01:33 INFO Remoting: Starting remoting
16/04/08 00:01:34 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem@192.168.56.1:52202]
16/04/08 00:01:34 INFO Utils: Successfully started service 'sparkDriverActorSystem' on port 52202.
16/04/08 00:01:34 INFO SparkEnv: Registering MapOutputTracker
16/04/08 00:01:34 INFO SparkEnv: Registering BlockManagerMaster
16/04/08 00:01:34 INFO DiskBlockManager: Created local directory at C:\Users\think\AppData\Local\Temp\blockmgr-0efb49ea-2c12-4819-8543-efcad6cbe9ee
16/04/08 00:01:34 INFO MemoryStore: MemoryStore started with capacity 1773.8 MB
16/04/08 00:01:35 INFO SparkEnv: Registering OutputCommitCoordinator
16/04/08 00:01:36 INFO Utils: Successfully started service 'SparkUI' on port 4040.
16/04/08 00:01:36 INFO SparkUI: Started SparkUI at http://192.168.56.1:4040
16/04/08 00:01:36 INFO Executor: Starting executor ID driver on host localhost
16/04/08 00:01:36 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 52209.
16/04/08 00:01:36 INFO NettyBlockTransferService: Server created on 52209
16/04/08 00:01:36 INFO BlockManagerMaster: Trying to register BlockManager
16/04/08 00:01:36 INFO BlockManagerMasterEndpoint: Registering block manager localhost:52209 with 1773.8 MB RAM, BlockManagerId(driver, localhost, 52209)
16/04/08 00:01:36 INFO BlockManagerMaster: Registered BlockManager
16/04/08 00:01:40 WARN : Your hostname, think-PC resolves to a loopback/non-reachable address: fe80:0:0:0:d401:a5b5:2103:6d13%eth8, but we couldn't find any external IP address!
16/04/08 00:01:41 INFO JSONRelation: Listing file:/D:/DT-IMF/testdata/peoples.json on driver
16/04/08 00:01:42 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 208.9 KB, free 208.9 KB)
16/04/08 00:01:43 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 19.4 KB, free 228.3 KB)
16/04/08 00:01:43 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:52209 (size: 19.4 KB, free: 1773.7 MB)
16/04/08 00:01:43 INFO SparkContext: Created broadcast 0 from json at SparkSQLwithJoin.java:28
16/04/08 00:01:43 INFO FileInputFormat: Total input paths to process : 1
16/04/08 00:01:43 INFO SparkContext: Starting job: json at SparkSQLwithJoin.java:28
16/04/08 00:01:43 INFO DAGScheduler: Got job 0 (json at SparkSQLwithJoin.java:28) with 1 output partitions
16/04/08 00:01:43 INFO DAGScheduler: Final stage: ResultStage 0 (json at SparkSQLwithJoin.java:28)