Requirements:
1. Read JSON data from an external file
2. Split the data as required (see the sketch after this list)
3. Write it into MongoDB directly via a DataFrame
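Each line of the input file is assumed to be a record prefix, the marker "____", and then the JSON payload; the split in step 2 keeps only the JSON part. A minimal sketch of that split (the sample line below is made up for illustration):

public class SplitDemo {
    public static void main(String[] args) {
        // Hypothetical raw line: "<prefix>____<json>"
        String line = "00001____{\"bookId\":\"b1\",\"content\":\"...\"}";
        // Index 1 is the JSON payload, matching x.split("____")[1] in the Spark jobs below.
        String json = line.split("____")[1];
        System.out.println(json);
    }
}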
Writing to MongoDB following the official Spark-MongoDB example
I tried the approach from the official example first, but it did not work: some fields in the JSON data are empty, and reading them causes an error.
import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.config.WriteConfig;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.SparkSession;
import org.json.JSONObject;

import java.util.HashMap;
import java.util.Map;

import mongo.Book;
public class Mongo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("mongo")
                .master("local[4]")
                .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/spark.mongo")
                .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/spark.mongo")
                .getOrCreate();

        // Each input line is "<prefix>____<json>"; keep only the JSON part.
        JavaRDD<String> input = spark.sparkContext()
                .textFile("/Users/yangyang/Desktop/json/part-r-00003.txt", 1)
                .toJavaRDD().map(x -> x.split("____")[1]);

        // Override the target collection and write concern for this job.
        Map<String, String> writeOverrides = new HashMap<String, String>();
        writeOverrides.put("collection", "spark");
        writeOverrides.put("writeConcern.w", "majority");
        WriteConfig writeConfig = WriteConfig.create(spark).withOptions(writeOverrides);
        // Map each JSON string to a Book bean. Note: JSONObject.get() throws a
        // JSONException as soon as a key is missing, so records with empty fields fail here.
        JavaRDD<Book> books = input.map(new Function<String, Book>() {
            public Book call(String s) throws Exception {
                JSONObject jsons = new JSONObject(s);
                Book book = new Book();
                book.setBookId(jsons.get("bookId").toString());
                book.setContent(jsons.get("content").toString());
                book.setContentStartPos(jsons.get("contentStartPos").toString());
                book.setCoord(jsons.get("coord").toString());
                book.setId(jsons.get("id").toString());
                book.setLineColor(jsons.get("lineColor").toString());
                book.setLineType(jsons.get("lineType").toString());
                book.setLineWidth(jsons.get("lineWidth").toString());
                book.setNoteCatalog(jsons.get("noteCatalog").toString());
                book.setNoteLabels(jsons.get("noteLabels").toString());
                book.setNoteOrigin(jsons.get("noteOrigin").toString());
                book.setNotePath(jsons.get("notePath").toString());
                book.setNotePostil(jsons.get("notePostil").toString());
                book.setNoteType(jsons.get("noteType").toString());
                book.setPageAngle(jsons.get("pageAngle").toString());
                book.setPageHeight(jsons.get("pageHeight").toString());
                book.setPageIndex(jsons.get("pageIndex").toString());
                book.setPageWidth(jsons.get("pageWidth").toString());
                book.setPdfId(jsons.get("pdfId").toString());
                book.setUpdateTime(jsons.get("updateTime").toString());
                book.setUserName(jsons.get("userName").toString());
                book.setSourceType(jsons.get("sourceType").toString());
                return book;
            }
        });
        // Note: the writeConfig built above is never passed in here, so the
        // spark.mongodb.output.uri settings are what actually apply to this save.
        MongoSpark.save(books, Book.class);
        spark.close();
    }
}
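A likely reason for the read errors mentioned above: with org.json, JSONObject.get() throws a JSONException as soon as a key is missing from a record, so a single document with absent fields kills the whole map stage. A minimal sketch of a more tolerant lookup, assuming org.json stays in use (the helper name readField is made up here):

import org.json.JSONObject;

public class SafeFieldDemo {
    // Hypothetical helper: return an empty string instead of throwing when
    // the key is absent or explicitly null in the JSON record.
    static String readField(JSONObject json, String key) {
        if (!json.has(key) || json.isNull(key)) {
            return "";
        }
        return json.optString(key, "");
    }

    public static void main(String[] args) {
        JSONObject json = new JSONObject("{\"bookId\":\"b1\"}");
        System.out.println(readField(json, "bookId"));   // prints: b1
        System.out.println(readField(json, "content"));  // prints an empty line, no exception
    }
}

Swapping jsons.get(...).toString() for readField(jsons, ...) in the mapper above would at least let records with partial fields through, instead of failing the job.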
After digging into the MongoSpark API, it turns out the API already has a method for converting a JSON RDD into a DataFrame.
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import com.mongodb.spark.MongoSpark;
public class DataSetMongo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("mongo")
                .master("local[4]")
                .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/spark.mongo")
                .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/spark.mongo")
                .getOrCreate();

        // Each input line is "<prefix>____<json>"; keep only the JSON part.
        JavaRDD<String> input = spark.sparkContext()
                .textFile("/Users/yangyang/Desktop/json/part-r-00003.txt", 1)
                .toJavaRDD().map(x -> x.split("____")[1]);

        // Let the connector parse the JSON RDD straight into a DataFrame.
        Dataset<Row> books = MongoSpark.read(spark).json(input);
        books.show();

        // Write the DataFrame to the "mongo" collection, replacing any existing data.
        MongoSpark.write(books).option("collection", "mongo").mode("overwrite").save();

        spark.close();
    }
}
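To double-check what actually landed in MongoDB (beyond the screenshot below), the same collection can be read back through the connector. A minimal sketch, reusing the SparkSession configuration above; the collection name "mongo" matches the write:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import com.mongodb.spark.MongoSpark;

public class VerifyMongo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("mongo-verify")
                .master("local[4]")
                .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/spark.mongo")
                .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/spark.mongo")
                .getOrCreate();

        // Read the collection written by DataSetMongo back as a DataFrame.
        Dataset<Row> saved = MongoSpark.read(spark)
                .option("collection", "mongo")
                .load();

        saved.printSchema();
        System.out.println("documents in collection: " + saved.count());
        saved.show(5);

        spark.close();
    }
}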
A screenshot of the data in the MongoDB database after the run: