package com.mongodb.spark_examples;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import com.mongodb.spark.MongoSpark;

public final class DatasetSQLDemo {
public static void main(final String[] args) throws InterruptedException, IOException, AnalysisException {
SparkSession spark = SparkSession.builder().master("local").appName("MongoSparkConnectorIntro")
.config("spark.mongodb.input.uri",
"mongodb://ip/db.collection")
.config("spark.mongodb.output.uri",
"mongodb://ip/db.collection")
.getOrCreate();
// Create a JavaSparkContext using the SparkSession's SparkContext object
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        // Load data and infer the schema; disregard the toDF() name, as it returns a Dataset<Row>
// Dataset<Row> implicitDS = MongoSpark.load(jsc).toDF();
// implicitDS.printSchema();
// implicitDS.show();
// Load data with explicit schema
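        // NOTE: the Character bean is assumed to live in its own file in this package, roughly:
        //   public final class Character implements java.io.Serializable {
        //       private String projid, areaid, commdeviceid, commparamid, bizdevicecode, bizparamid;
        //       // getters and setters for each field
        //   }
        // Only the fields referenced by the SQL queries below are listed; the real bean may carry more.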
Dataset<Character> explicitDS = MongoSpark.load(jsc).toDS(Character.class);
explicitDS.printSchema();
explicitDS.show();
explicitDS.createOrReplaceTempView("characters");
        // Read the PostgreSQL mapping view, collect the distinct IDs, then write one
        // MongoDB collection per ID combination
List<String[]> result = runSparkPostgresqlJdbc(spark);
doExchange(result, spark.cloneSession());
// explicitDS.createOrReplaceTempView("characters");
// Dataset<Row> centenarians = spark.sql("SELECT * FROM characters c WHERE c.commparamid = 'DEV020409-PRM01'");
// centenarians.show();
// Write the data to the "hundredClub" collection
// MongoSpark.write(centenarians).option("collection", "hundredClub").mode("overwrite").save();
// MongoSpark.write(centenarians).mode("overwrite").save();
// MongoSpark.write(centenarians).mode("append").save();
// Load the data from the "hundredClub" collection
// MongoSpark.load(spark.cloneSession(),
// ReadConfig.create(spark.cloneSession()).withOption("collection", "loopxuClean"), Character.class)
// .limit(1000).show();
jsc.close();
}
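
    /**
     * Reads the view_mapping_mongo view over JDBC, registers it as a temp view, and returns the
     * distinct IDs as six arrays in the order projid, areaid, commdeviceid, commparamid,
     * bizdeviceid, bizparamid.
     */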
private static List<String[]> runSparkPostgresqlJdbc(SparkSession spark) throws AnalysisException {
        // Create a Properties object for the JDBC connection
        System.out.println("Make sure the PostgreSQL database is running and view_mapping_mongo is populated");
Properties connectionProperties = new Properties();
        // Set the database user name, password and the PostgreSQL driver class
        System.out.println("Setting the database user name, password and the PostgreSQL driver");
connectionProperties.put("user", "user");
connectionProperties.put("password", "password");
connectionProperties.put("driver", "org.postgresql.Driver");
        // Read the PostgreSQL view_mapping_mongo view through the Spark JDBC data source
        System.out.println("Reading the PostgreSQL view_mapping_mongo view through Spark JDBC");
Dataset<Row> jdbcDF = spark.read().jdbc("jdbc:postgresql://ip/db", "view_mapping_mongo",
connectionProperties);
        // Register jdbcDF as a temp view and show its contents
jdbcDF.createOrReplaceTempView("view_mapping_mongo");
jdbcDF.show();
List<Row> list = jdbcDF.collectAsList();
Set<String> projids = new HashSet<String>();
Set<String> areaids = new HashSet<String>();
Set<String> commdeviceids = new HashSet<String>();
Set<String> commparamids = new HashSet<String>();
Set<String> bizdeviceids = new HashSet<String>();
Set<String> bizparamids = new HashSet<String>();
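        // Column positions assumed from the view_mapping_mongo layout:
        // 0 = projid, 2 = areaid, 4 = commdeviceid, 5 = bizdeviceid, 7 = commparamid, 8 = bizparamid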
for (Row r : list) {
projids.add(r.getString(0));
areaids.add(r.getString(2));
commdeviceids.add(r.getString(4));
commparamids.add(r.getString(7));
bizdeviceids.add(r.getString(5));
bizparamids.add(r.getString(8));
}
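        // The order of these arrays matters: doExchange consumes them front to back
        // when building the combined filters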
List<String[]> result = new ArrayList<String[]>();
result.add(projids.toArray(new String[projids.size()]));
result.add(areaids.toArray(new String[areaids.size()]));
result.add(commdeviceids.toArray(new String[commdeviceids.size()]));
result.add(commparamids.toArray(new String[commparamids.size()]));
result.add(bizdeviceids.toArray(new String[bizdeviceids.size()]));
result.add(bizparamids.toArray(new String[bizparamids.size()]));
return result;
}
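
    /**
     * Recursively builds the cartesian product of the ID arrays. Each round combines the first two
     * entries of arrayLists into ArrayList<String> tuples, runs the matching query against the
     * "characters" view, writes the result to a MongoDB collection named by concatenating the IDs,
     * and then recurses with the tuples prepended to the remaining arrays.
     */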
    public static void doExchange(List<?> arrayLists, SparkSession spark) {
int len = arrayLists.size();
        // If fewer than two arrays are left, every combination has been generated and the recursion stops
if (len < 2) {
return;
}
        // Determine the length of the first element (either an original String[] or the
        // list of combinations produced by the previous round)
        int len0;
        if (arrayLists.get(0) instanceof String[]) {
            String[] arr0 = (String[]) arrayLists.get(0);
            len0 = arr0.length;
        } else {
            len0 = ((List<?>) arrayLists.get(0)).size();
        }
        // Get the second array
String[] arr1 = (String[]) arrayLists.get(1);
int len1 = arr1.length;
        // Number of combinations the current two arrays can form
int lenBoth = len0 * len1;
        // Temporary list that collects the combinations built in this round
ArrayList<ArrayList<String>> tempArrayLists = new ArrayList<ArrayList<String>>(lenBoth);
        // Outer loop: iterate over the first element of arrayLists
for (int i = 0; i < len0; i++) {
            // Inner loop: iterate over the second element of arrayLists
for (int j = 0; j < len1; j++) {
                // If the first element is still a String[], this is the first round of combining
if (arrayLists.get(0) instanceof String[]) {
String[] arr0 = (String[]) arrayLists.get(0);
ArrayList<String> arr = new ArrayList<String>();
arr.add(arr0[i]);
arr.add(arr1[j]);
                    // Concatenate the two IDs to form the name of the target MongoDB collection
String resultTable = "";
for (String str : arr) {
resultTable = resultTable.concat(str);
}
// System.out.println(resultTable);
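                    // The filter is assembled by string concatenation because the IDs come from the
                    // trusted mapping view; if untrusted input could ever reach this point, prefer the
                    // Dataset filter API (e.g. org.apache.spark.sql.functions.col) over concatenated SQL.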
Dataset<Row> centenarians = spark.sql("SELECT * FROM characters c WHERE c.projid = '" + arr0[i]
+ "' AND c.areaid='" + arr1[j] + "'");
centenarians.show();
MongoSpark.write(centenarians).option("collection", resultTable).mode("overwrite").save();
tempArrayLists.add(arr);
} else {
                    // At least one round has finished; combine the previous round's results with the next array
ArrayList<ArrayList<String>> arrtemp = (ArrayList<ArrayList<String>>) arrayLists.get(0);
ArrayList<String> arr = new ArrayList<String>();
for (int k = 0; k < arrtemp.get(i).size(); k++) {
arr.add(arrtemp.get(i).get(k));
}
arr.add(arr1[j]);
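                    // Concatenate the IDs to form the name of the target MongoDB collection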
String resultTable = "";
for (String str : arr) {
resultTable = resultTable.concat(str);
}
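                    // Build a WHERE clause that matches however many IDs have been combined so far (3 to 6)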
String sql = "";
switch (arr.size()) {
case 3:
sql = "SELECT * FROM characters c WHERE c.projid = '" + arr.get(0) + "' AND c.areaid='"
+ arr.get(1) + "' AND c.commdeviceid ='" + arr.get(2) + "'";
break;
case 4:
sql = "SELECT * FROM characters c WHERE c.projid = '" + arr.get(0) + "' AND c.areaid='"
+ arr.get(1) + "' AND c.commdeviceid ='" + arr.get(2) + "' AND c.commparamid ='"
+ arr.get(3) + "'";
break;
                    case 5:
                        sql = "SELECT * FROM characters c WHERE c.projid = '" + arr.get(0) + "' AND c.areaid='"
                                + arr.get(1) + "' AND c.commdeviceid ='" + arr.get(2) + "' AND c.commparamid ='"
                                + arr.get(3) + "' AND c.bizdevicecode ='" + arr.get(4) + "'";
                        break;
                    case 6:
                        sql = "SELECT * FROM characters c WHERE c.projid = '" + arr.get(0) + "' AND c.areaid='"
                                + arr.get(1) + "' AND c.commdeviceid ='" + arr.get(2) + "' AND c.commparamid ='"
                                + arr.get(3) + "' AND c.bizdevicecode ='" + arr.get(4) + "' AND c.bizparamid ='"
                                + arr.get(5) + "'";
                        break;
default:
break;
}
Dataset<Row> centenarians = spark.sql(sql);
centenarians.show();
MongoSpark.write(centenarians).option("collection", resultTable).mode("overwrite").save();
tempArrayLists.add(arr);
}
}
}
        // Build a new list holding this round's combinations plus the arrays that still need processing
        List<Object> newArrayLists = new ArrayList<Object>();
        // Carry over the arrays that have not been combined yet; start at i = 2 because the first two entries were consumed above
for (int i = 2; i < arrayLists.size(); i++) {
newArrayLists.add(arrayLists.get(i));
}
        // Prepend this round's combinations so the next round pairs them with the following array
newArrayLists.add(0, tempArrayLists);
// System.out.println(tempArrayLists);
        // Recurse until only the combined results remain
doExchange(newArrayLists, spark);
}
}