spark2.1.0-mongodb

本文介绍如何使用MongoDB与Apache Spark进行数据交互,包括从MongoDB读取数据到Spark RDD,将Spark RDD数据写回MongoDB,以及利用Spark SQL进行数据集操作和SQL查询。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

1.从MongoDB读取

package com.mongodb.spark;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.bson.Document;

import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.rdd.api.java.JavaMongoRDD;

public final class ReadFromMongoDB {

	/**
	 * Reads the "wangzs.zhaopin" collection from MongoDB into a Spark RDD and
	 * prints the document count plus the first document rendered as JSON.
	 *
	 * Connection URIs are hard-coded below; adjust host/port/database before
	 * running against a real deployment.
	 */
	public static void main(final String[] args) throws InterruptedException {

		SparkSession spark = SparkSession.builder().master("local").appName("MongoSparkConnectorIntro")
				.config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.zhaopin")
				.config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo").getOrCreate();

		// Create a JavaSparkContext using the SparkSession's SparkContext object
		JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

		try {
			/* Start Example: Read data from MongoDB ************************/
			JavaMongoRDD<Document> rdd = MongoSpark.load(jsc);
			/* End Example **************************************************/

			// Analyze data from MongoDB
			System.out.println(rdd.count());
			System.out.println(rdd.first().toJson());
		} finally {
			// Release Spark resources even when count()/first() throws
			// (the original leaked the context on failure).
			jsc.close();
			spark.stop();
		}
	}
}

2.写入MongoDB

package com.mongodb.spark;

import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.config.WriteConfig;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.SparkSession;

import org.bson.Document;

import static java.util.Arrays.asList;

import java.util.HashMap;
import java.util.Map;

public final class WriteToMongoDBWriteConfig {

	/**
	 * Builds 10 small documents and saves them to the "spark" collection using
	 * a custom {@code WriteConfig} that overrides the target collection and the
	 * write concern ("majority").
	 */
	public static void main(final String[] args) throws InterruptedException {

		SparkSession spark = SparkSession.builder().master("local").appName("MongoSparkConnectorIntro")
				.config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.zhaopin")
				.config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo").getOrCreate();

		JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

		try {
			// Create a custom WriteConfig: collection "spark", write concern "majority".
			Map<String, String> writeOverrides = new HashMap<>();
			writeOverrides.put("collection", "spark");
			writeOverrides.put("writeConcern.w", "majority");
			WriteConfig writeConfig = WriteConfig.create(jsc).withOptions(writeOverrides);

			// Create a RDD of 10 documents. A lambda replaces the anonymous
			// Function class — the pom already targets Java 1.8, and Spark's
			// Function interface is a serializable functional interface.
			JavaRDD<Document> sparkDocuments = jsc.parallelize(asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
					.map(i -> Document.parse("{spark: " + i + ",name:" + i + "}"));

			/* Start Example: Save data from RDD to MongoDB *****************/
			MongoSpark.save(sparkDocuments, writeConfig);
			/* End Example **************************************************/
		} finally {
			// Release Spark resources even when the save fails
			// (the original leaked the context on failure).
			jsc.close();
			spark.stop();
		}
	}

}

3.聚合

package com.mongodb.spark;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.bson.Document;

import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.rdd.api.java.JavaMongoRDD;

import static java.util.Collections.singletonList;

public final class Aggregation {

	/**
	 * Loads the "wangzs.zhaopin" collection as an RDD and filters it with a
	 * MongoDB aggregation $match pipeline (pushed down to the server by the
	 * connector), then prints the match count and the matching documents.
	 */
	public static void main(final String[] args) throws InterruptedException {

		SparkSession session = SparkSession.builder()
				.master("local")
				.appName("Aggregation")
				.config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.zhaopin")
				.config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo")
				.getOrCreate();

		// Wrap the session's SparkContext for the Java RDD API.
		JavaSparkContext javaContext = new JavaSparkContext(session.sparkContext());

		// Load the input collection as an RDD of BSON documents.
		JavaMongoRDD<Document> sourceRdd = MongoSpark.load(javaContext);

		/* Start Example: Use aggregation to filter a RDD ***************/
		Document matchStage = Document.parse("{ $match: { 'gzdd' : '上海-普陀区' } }");
		JavaMongoRDD<Document> filteredRdd = sourceRdd.withPipeline(singletonList(matchStage));
		/* End Example **************************************************/

		// Print how many documents matched, then the documents themselves.
		System.out.println(filteredRdd.count());
		System.out.println(filteredRdd.collect());

		javaContext.close();

	}
}

4.数据集和SQL(以下是测试集合 wangzs.sparktest 中的示例文档,供后面的 Dataset/SQL 示例使用)

/* 1 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac96"),
    "name" : "Bilbo Baggins",
    "age" : 50.0
}

/* 2 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac97"),
    "name" : "Gandalf",
    "age" : 1000.0
}

/* 3 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac98"),
    "name" : "Thorin",
    "age" : 195.0
}

/* 4 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac99"),
    "name" : "Balin",
    "age" : 178.0
}

/* 5 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9a"),
    "name" : "Kíli",
    "age" : 77.0
}

/* 6 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9b"),
    "name" : "Dwalin",
    "age" : 169.0
}

/* 7 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9c"),
    "name" : "Óin",
    "age" : 167.0
}

/* 8 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9d"),
    "name" : "Glóin",
    "age" : 158.0
}

/* 9 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9e"),
    "name" : "Fíli",
    "age" : 82.0
}

/* 10 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9f"),
    "name" : "Bombur"
}
package com.mongodb.spark;

import java.io.Serializable;

/**
 * Simple serializable bean used as the explicit schema for
 * {@code MongoSpark.load(jsc).toDS(Character.class)} in the Dataset example.
 *
 * NOTE(review): the sample documents above store "age" as a double (e.g.
 * {@code 50.0}) while this bean declares {@code Integer} — confirm the
 * collection actually holds integer ages, otherwise the Dataset conversion
 * may not map the field as expected. Also note this class shadows
 * {@code java.lang.Character} within its own package.
 */
public final class Character implements Serializable {
	// Explicit serialVersionUID keeps the serialized form stable across edits
	// (the original relied on the compiler-generated, change-sensitive value).
	private static final long serialVersionUID = 1L;

	private String name;
	private Integer age;

	/** @return the character's name, or null if never set. */
	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	/** @return the character's age, or null if never set (e.g. "Bombur" has no age field). */
	public Integer getAge() {
		return age;
	}

	public void setAge(final Integer age) {
		this.age = age;
	}
}

package com.mongodb.spark;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public final class DatasetSQLDemo {

	/**
	 * Loads the "wangzs.sparktest" collection into a typed Dataset using the
	 * {@code Character} bean as an explicit schema, queries it through a temp
	 * view with Spark SQL, and writes the result to the "hundredClub"
	 * collection (overwriting any existing contents).
	 */
	public static void main(final String[] args) throws InterruptedException {

		// appName fixed: the original said "Aggregation" (copy-pasted from the
		// previous example), which mislabeled this job in the Spark UI.
		SparkSession spark = SparkSession.builder().master("local").appName("DatasetSQLDemo")
				.config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparktest")
				.config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo").getOrCreate();

		// Create a JavaSparkContext using the SparkSession's SparkContext object
		JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

		// Load data with explicit schema (Character bean: name + age).
		Dataset<Character> explicitDS = MongoSpark.load(jsc).toDS(Character.class);
		explicitDS.printSchema();
		explicitDS.show();

		// Create the temp view and execute the query.
		explicitDS.createOrReplaceTempView("characters");
		Dataset<Row> centenarians = spark.sql("SELECT name, age FROM characters WHERE age >= 100");
		centenarians.show();

		// Write the query result to the "hundredClub" collection, replacing it.
		MongoSpark.write(centenarians).option("collection", "hundredClub").mode("overwrite").save();

		jsc.close();

	}
}

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>com.wangzs</groupId>
	<artifactId>spark-2.1.0-learn</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>spark-2.1.0-learn</name>
	<url>http://maven.apache.org</url>

	<dependencies>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.12</version>
			<scope>test</scope>
		</dependency>

		<dependency> <!-- Spark dependency -->
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-core_2.11</artifactId>
			<version>2.1.0</version>
		</dependency>

		<dependency>
			<groupId>org.mongodb.spark</groupId>
			<artifactId>mongo-spark-connector_2.11</artifactId>
			<version>2.1.0</version>
		</dependency>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-sql_2.11</artifactId>
			<version>2.1.0</version>
		</dependency>

	</dependencies>

	<build>
		<!-- NOTE(review): these plugins are declared only under pluginManagement,
		     which provides configuration defaults but does not by itself bind the
		     plugins to this module's build. Confirm the settings take effect;
		     otherwise also declare them under <build><plugins>. -->
		<pluginManagement>
			<plugins>
				<!-- Compile for Java 8. -->
				<plugin>
					<groupId>org.apache.maven.plugins</groupId>
					<artifactId>maven-compiler-plugin</artifactId>
					<configuration>
						<source>1.8</source>
						<target>1.8</target>
					</configuration>
				</plugin>
				<!-- Copy resources with UTF-8 encoding. -->
				<plugin>
					<groupId>org.apache.maven.plugins</groupId>
					<artifactId>maven-resources-plugin</artifactId>
					<configuration>
						<encoding>UTF-8</encoding>
					</configuration>
				</plugin>
				<!-- Skip tests when packaging. -->
				<plugin>
					<groupId>org.apache.maven.plugins</groupId>
					<artifactId>maven-surefire-plugin</artifactId>
					<configuration>
						<skipTests>true</skipTests>
					</configuration>
				</plugin>
			</plugins>
		</pluginManagement>
	</build>
</project>

5.pom文件

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>com.wangzs</groupId>
	<artifactId>spark-2.1.0-learn</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>spark-2.1.0-learn</name>
	<url>http://maven.apache.org</url>

	<dependencies>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.12</version>
			<scope>test</scope>
		</dependency>

		<dependency> <!-- Spark dependency -->
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-core_2.11</artifactId>
			<version>2.1.0</version>
		</dependency>

		<dependency>
			<groupId>org.mongodb.spark</groupId>
			<artifactId>mongo-spark-connector_2.11</artifactId>
			<version>2.1.0</version>
		</dependency>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-sql_2.11</artifactId>
			<version>2.1.0</version>
		</dependency>

	</dependencies>

	<build>
		<!-- NOTE(review): these plugins are declared only under pluginManagement,
		     which provides configuration defaults but does not by itself bind the
		     plugins to this module's build. Confirm the settings take effect;
		     otherwise also declare them under <build><plugins>. -->
		<pluginManagement>
			<plugins>
				<!-- Compile for Java 8. -->
				<plugin>
					<groupId>org.apache.maven.plugins</groupId>
					<artifactId>maven-compiler-plugin</artifactId>
					<configuration>
						<source>1.8</source>
						<target>1.8</target>
					</configuration>
				</plugin>
				<!-- Copy resources with UTF-8 encoding. -->
				<plugin>
					<groupId>org.apache.maven.plugins</groupId>
					<artifactId>maven-resources-plugin</artifactId>
					<configuration>
						<encoding>UTF-8</encoding>
					</configuration>
				</plugin>
				<!-- Skip tests when packaging. -->
				<plugin>
					<groupId>org.apache.maven.plugins</groupId>
					<artifactId>maven-surefire-plugin</artifactId>
					<configuration>
						<skipTests>true</skipTests>
					</configuration>
				</plugin>
			</plugins>
		</pluginManagement>
	</build>
</project>

<think>好的,我现在需要解决用户在Spark中遇到的注册相同插件导致的IllegalArgumentException错误,特别是涉及Datanucleus-core和HiveSessionState的问题。首先,我应该回忆一下Spark和Hive的依赖管理机制。 用户提到了IllegalArgumentException,通常这可能和类路径冲突有关。Datanucleus-core是Hive用来处理JPA和元数据存储的库,而Spark在集成Hive时可能需要特定版本的Datanucleus。如果项目中同时存在多个版本的Datanucleus-core,或者在启动时重复注册了插件,就会导致冲突。 接下来,我应该考虑如何检查依赖冲突。用户可能使用了Maven或SBT,这时候需要查看依赖树,排除重复的datanucleus-core依赖。例如,在Maven中可以用mvn dependency:tree来排查,并排除Hive或Spark中自带的版本。 另外,Spark的启动脚本中需要正确配置Datanucleus插件。用户需要确保在spark-defaults.conf或者命令行参数中正确设置jars参数,指向正确的Datanucleus JAR包,并且只加载一次。如果用户同时通过--jars参数和spark.jars配置添加了相同的JAR,就会导致重复注册。 还有,Hive的会话状态初始化失败可能和类加载器有关。Spark在启用Hive支持时,需要确保Hive的相关依赖被正确隔离,避免父类加载器加载了冲突的类。这时候可以检查是否使用了spark.sql.hive.metastore.version和spark.sql.hive.metastore.jars参数正确指定了Hive的版本和JAR包路径。 最后,用户可能需要清理缓存或临时文件,有时候旧的元数据缓存会导致类加载错误。建议用户尝试删除metastore_db目录和临时文件夹,然后重新启动应用程序。 总结一下,解决方案可能包括排除冲突的依赖、正确配置插件路径、检查类加载策略以及清理缓存。需要分步骤引导用户进行这些操作,并验证每个步骤后的结果。</think>### Spark中解决注册相同插件导致的IllegalArgumentException错误 #### 问题背景 在Spark中使用Hive时,可能会因重复加载`datanucleus-core`插件或版本冲突导致以下错误: ```java java.lang.IllegalArgumentException: Plugin (Bundle) "org.datanucleus" is already registered ``` 此问题通常由以下原因引起: 1. **依赖冲突**:Spark和Hive依赖不同版本的`datanucleus-core`。 2. **重复注册**:通过`--jars`或`spark.jars`多次加载同一插件。 3. **类加载器问题**:父类加载器优先加载了旧版本的类[^1]。 --- #### 解决方案步骤 ##### 1. **排除冲突的依赖** 在构建工具(如Maven/SBT)中显式排除冗余的`datanucleus-core`依赖。 **示例(Maven)**: ```xml <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-hive_2.12</artifactId> <version>3.3.0</version> <exclusions> <exclusion> <groupId>org.datanucleus</groupId> <artifactId>datanucleus-core</artifactId> </exclusion> </exclusions> </dependency> ``` ##### 2. **统一插件版本** 手动添加兼容的`datanucleus-core`版本(例如4.1.17)到Spark的类路径: ```bash spark-shell --jars /path/to/datanucleus-core-4.1.17.jar ``` ##### 3. **配置Hive元存储路径** 在`spark-defaults.conf`中明确指定Hive元存储的版本和JAR路径: ```properties spark.sql.hive.metastore.version 3.1.2 spark.sql.hive.metastore.jars /path/to/hive-metastore-jars/* ``` ##### 4. 
**清理临时文件** 删除本地生成的元数据缓存和临时文件: ```bash rm -r metastore_db rm -r spark-warehouse ``` --- #### 验证方法 1. 检查依赖树: ```bash mvn dependency:tree | grep datanucleus ``` 2. 查看Spark日志:确保只加载了一次`datanucleus-core`插件。 ---
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值