A project of mine needed to drive the Spark API through HTTP requests, with the jobs running in cluster mode. (I lost a few hairs to the pitfalls along the way, so I'm writing it all down...)
I. Test environment
1. JDK 1.8
2. Spark 2.3.1
3. IDEA, Win10, CentOS 7
I develop on Win10 and launch the project from IDEA, with three CentOS 7 virtual machines serving as the Spark cluster.
II. Create and configure the SpringBoot project
1. Create the SpringBoot project (omitted).
2. Add the pom dependencies; every dependency I used is listed in Appendix 1 at the end of this article.
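For orientation, here is a minimal sketch of the two core Spark artifacts that list has to include (Spark 2.3.1 builds against Scala 2.11, hence the _2.11 suffix on the artifact IDs); treat this as an excerpt, with Appendix 1 remaining the authoritative list:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.3.1</version>
</dependency>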
3. Add the Spark settings to application.yml.
spark:
  app:
    name: yl
  home: 127.0.0.1
  master:
    uri: spark://host03:7077
  driver:
    memory: 2g
  worker:
    memory: 2g
  executor:
    memory: 1g
  rpc:
    message:
      maxSize: 1024
4. Add a Spark configuration class that hands the SparkSession over to the Spring container.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.support.PropertySourcesPlaceholderConfigurer;
import org.springframework.core.env.Environment;

@Configuration
public class SparkConfig {

    @Autowired
    private Environment env;

    @Value("${spark.app.name}")
    private String appName;
    @Value("${spark.home}")
    private String sparkHome;
    @Value("${spark.master.uri}")
    private String sparkMasterUri;
    @Value("${spark.driver.memory}")
    private String sparkDriverMemory;
    @Value("${spark.worker.memory}")
    private String sparkWorkerMemory;
    @Value("${spark.executor.memory}")
    private String sparkExecutorMemory;
    @Value("${spark.rpc.message.maxSize}")
    private String sparkRpcMessageMaxSize;

    // Build the SparkConf from the values bound out of application.yml.
    @Bean
    public SparkConf sparkConf() {
        SparkConf sparkConf = new SparkConf()
                .setAppName(appName)
                .setMaster(sparkMasterUri)
                .set("spark.driver.memory", sparkDriverMemory)
                .set("spark.worker.memory", sparkWorkerMemory) //"26g"
                .set("spark.executor.memory", sparkExecutorMemory)
                .set("spark.rpc.message.maxSize", sparkRpcMessageMaxSize);
        return sparkConf;
    }

    // The SparkSession that step 4 hands over to the Spring container.
    @Bean
    @ConditionalOnMissingBean(SparkSession.class)
    public SparkSession sparkSession() {
        return SparkSession.builder()
                .config(sparkConf())
                .getOrCreate();
    }

    // A JavaSparkContext wrapping the session's underlying SparkContext.
    @Bean
    public JavaSparkContext javaSparkContext() {
        return new JavaSparkContext(sparkSession().sparkContext());
    }

    // Required so the ${...} placeholders in the @Value fields resolve.
    @Bean
    public static PropertySourcesPlaceholderConfigurer propertySourcesPlaceholderConfigurer() {
        return new PropertySourcesPlaceholderConfigurer();
    }
}