Configure the System Environment
Edit the /etc/hosts file
192.168.36.128 master
192.168.36.129 worker1
192.168.36.130 worker2
All three machines (192.168.36.128, 192.168.36.129, 192.168.36.130) are Ubuntu 20.04 Server LTS virtual machines.
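A quick, optional sanity check that name resolution works after editing /etc/hosts (run from master, for example):
ping -c 2 worker1
ping -c 2 worker2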
Besides editing the hosts file, each machine must also have its hostname set with the command below. Otherwise, when you open 192.168.36.128:50070 to inspect the Hadoop cluster, the Overview panel will report 2 live nodes (the two workers) while the Datanodes panel lists only 1 node, most likely because the two workers share the same hostname.
hostnamectl set-hostname master
Then reboot the machine.
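The command above is for master; on each worker run the same command with that node's own name before rebooting, for example:
hostnamectl set-hostname worker1   # on 192.168.36.129
hostnamectl set-hostname worker2   # on 192.168.36.130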
Install SSH and set up passwordless login
Confirm that SSH is installed on every machine, then configure it.
Edit the SSH daemon configuration file /etc/ssh/sshd_config so that it contains the following settings:
Port 22
PermitRootLogin yes
PubkeyAuthentication yes
PasswordAuthentication yes
UsePAM yes
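Optionally, you can check the file for syntax errors before restarting; sshd -t only parses the configuration and prints nothing when it is valid:
sshd -t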
After the changes are in place, restart the sshd service:
systemctl restart sshd
On master, generate an SSH key pair and copy the public key to every worker node (here worker1 and worker2):
ssh-keygen -t rsa   # Note: just press Enter at all three prompts
scp ~/.ssh/id_rsa.pub root@worker1:~/.ssh/
Note: if ~/.ssh/ does not exist on a node, simply ssh from that node to itself once and the directory will be created automatically.
On each node, append the public key to authorized_keys:
cat ~/.ssh/id_rsa.pub>>~/.ssh/authorized_keys
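Alternatively, ssh-copy-id performs the copy and the append in a single step (an equivalent shortcut to the scp + cat steps above, not part of the original instructions):
ssh-copy-id -i ~/.ssh/id_rsa.pub root@worker1
ssh-copy-id -i ~/.ssh/id_rsa.pub root@worker2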
Test that you can ssh to the node itself:
[root@localhost sbin]# ssh master
Note: if you see an error like the following,
ECDSA host key for master has changed and you have requested strict checking.
Host key verification failed.
run the command below; the old known_hosts file will be saved as known_hosts.old and a new one will be generated:
[root@localhost sbin]# ssh-keygen -R master
# Host master found: line 2
/root/.ssh/known_hosts updated.
Original contents retained as /root/.ssh/known_hosts.old
Connecting again should now succeed. Repeat the same SSH configuration steps on worker1 and worker2.
Install the Big Data Components
Obtain the components
Link: https://pan.baidu.com/s/1oVtPeiKUnAfrcofHO5VfoQ
Extraction code: atbk
The archive contains: hadoop (3.3.4), jdk (1.8), scala (2.12.17), zookeeper (3.6.1), a template folder, and setup.sh. (The setup script below also expects a spark directory alongside these.)
Configure with the deployment script setup.sh
Run the deployment script on every machine to configure all of the components. The full script is listed below, followed by an example of how to run it:
#!/bin/bash
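# setup.sh: configures JDK, Hadoop, Scala, Spark and ZooKeeper in place.
# For each component it fills in the *.template files under ./template,
# backs up the existing config file as *.bak, copies the generated file into
# the component's conf directory, and appends the required environment
# variables to /etc/profile.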
function log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') ""$@"
}
CRTDIR=$(pwd)
JDK_DIR="$CRTDIR""/jdk"
HADOOP_DIR="$CRTDIR""/hadoop"
SCALA_DIR="$CRTDIR""/scala"
SPARK_DIR="$CRTDIR""/spark"
TEMPLATE_CONF_DIR="$CRTDIR""/template/"
ZK_DIR="$CRTDIR""/zookeeper"
MASTER="master"
WORKERS="worker1,worker2"
#"/etc/profile"
ENV="/etc/profile"
log "checking components jdk,hadoop,scala,spark,zookeeper..."
if [ ! -d "$JDK_DIR" ]; then
log "jdk is not found"
exit 1
fi
if [ ! -d "$HADOOP_DIR" ]; then
log "hadoop is not found"
exit 1
fi
if [ ! -d "$SCALA_DIR" ]; then
log "scala is not found"
exit 1
fi
if [ ! -d "$SPARK_DIR" ]; then
log "spark is not found"
exit 1
fi
if [ ! -d "$ZK_DIR" ]; then
log "zookeeper is not found"
exit 1
fi
log "Installing JDK..."
if [ -z $JAVA_HOME ]; then
log "JAVA_HOME is not found,using the JDK in the package to install..."
log "Configuring JDK's environment..."
echo "export JAVA_HOME=$JDK_DIR" >>$ENV
echo 'export CLASSPATH=$CLASSPATH:$JAVA_HOME/lib:$JAVA_HOME/jre/lib' >>$ENV
echo 'export PATH=$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$PATH:$HOME/bin' >>$ENV
else
log "JDK already existed"
fi
log "The JDK's environment was configured successfully"
source $ENV
java -version
#----------------------------------------------Hadoop start------------------------------------------------
log "Installing Hadoop..."
log "Configuring core-site.xml..."
FILE_NAME="core-site.xml"
CORE_SITE_TMP="$TEMPLATE_CONF_DIR""$FILE_NAME"
CORE_SITE_TEMPLATE="$CORE_SITE_TMP"".template"
cp $CORE_SITE_TEMPLATE $CORE_SITE_TMP
sed -i "s|MASTER|$MASTER|g" $CORE_SITE_TMP
sed -i "s|CRTDIR|$CRTDIR|g" $CORE_SITE_TMP
CORE_SITE="$HADOOP_DIR""/etc/hadoop/""$FILE_NAME"
CORE_SITE_BAK="$CORE_SITE"".bak"
rm -rf $CORE_SITE_BAK
mv $CORE_SITE $CORE_SITE_BAK
cp $CORE_SITE_TMP $CORE_SITE
log "Configuring hdfs-site.xml..."
FILE_NAME="hdfs-site.xml"
HDFS_SITE_TMP="$TEMPLATE_CONF_DIR""$FILE_NAME"
HDFS_SITE_TEMPLATE="$HDFS_SITE_TMP"".template"
cp $HDFS_SITE_TEMPLATE $HDFS_SITE_TMP
sed -i "s|MASTER|$MASTER|g" $HDFS_SITE_TMP
sed -i "s|CRTDIR|$CRTDIR|g" $HDFS_SITE_TMP
HDFS_SITE="$HADOOP_DIR""/etc/hadoop/""$FILE_NAME"
HDFS_SITE_BAK="$HDFS_SITE"".bak"
rm -rf $HDFS_SITE_BAK
mv $HDFS_SITE $HDFS_SITE_BAK
cp $HDFS_SITE_TMP $HDFS_SITE
log "Configuring mapred-site.xml..."
FILE_NAME="mapred-site.xml"
MAPRED_SITE_TMP="$TEMPLATE_CONF_DIR""$FILE_NAME"
MAPRED_SITE_TEMPLATE="$MAPRED_SITE_TMP"".template"
cp $MAPRED_SITE_TEMPLATE $MAPRED_SITE_TMP
sed -i "s|MASTER|$MASTER|g" $MAPRED_SITE_TMP
MAPRED_SITE="$HADOOP_DIR""/etc/hadoop/""$FILE_NAME"
MAPRED_SITE_BAK="$MAPRED_SITE"".bak"
rm -rf $MAPRED_SITE_BAK
mv $MAPRED_SITE $MAPRED_SITE_BAK
cp $MAPRED_SITE_TMP $MAPRED_SITE
log "Configuring yarn-site.xml..."
FILE_NAME="yarn-site.xml"
YARN_SITE_TMP="$TEMPLATE_CONF_DIR""$FILE_NAME"
YARN_SITE_TEMPLATE="$YARN_SITE_TMP"".template"
cp $YARN_SITE_TEMPLATE $YARN_SITE_TMP
sed -i "s|MASTER|$MASTER|g" $YARN_SITE_TMP
YARN_SITE="$HADOOP_DIR""/etc/hadoop/""$FILE_NAME"
YARN_SITE_BAK="$YARN_SITE"".bak"
rm -rf $YARN_SITE_BAK
mv $YARN_SITE $YARN_SITE_BAK
cp $YARN_SITE_TMP $YARN_SITE
log "Configuring workers file..."
FILE_NAME="workers"
WORKERS_FILE_TMP="$TEMPLATE_CONF_DIR""$FILE_NAME"
WORKERS_FILE_TEMPLATE="$WORKERS_FILE_TMP"".template"
cp $WORKERS_FILE_TEMPLATE $WORKERS_FILE_TMP
WORKERS_CONT=${WORKERS//,/\\n}
sed -i "s|WORKERS|$WORKERS_CONT|g" $WORKERS_FILE_TMP
WORKERS_FILE="$HADOOP_DIR""/etc/hadoop/""$FILE_NAME"
WORKERS_FILE_BAK="$WORKERS_FILE"".bak"
rm -rf $WORKERS_FILE_BAK
mv $WORKERS_FILE $WORKERS_FILE_BAK
cp $WORKERS_FILE_TMP $WORKERS_FILE
log "Configuring hadoop-env.sh..."
FILE_NAME="hadoop-env.sh"
HADOOP_ENV_TMP="$TEMPLATE_CONF_DIR""$FILE_NAME"
HADOOP_ENV_TEMPLATE="$HADOOP_ENV_TMP"".template"
cp $HADOOP_ENV_TEMPLATE $HADOOP_ENV_TMP
sed -i "s|JDKHOME|$JDK_DIR|g" $HADOOP_ENV_TMP
HADOOP_ENV_FILE="$HADOOP_DIR""/etc/hadoop/""$FILE_NAME"
HADOOP_ENV_BAK="$HADOOP_ENV_FILE"".bak"
rm -rf $HADOOP_ENV_BAK
mv $HADOOP_ENV_FILE $HADOOP_ENV_BAK
cp $HADOOP_ENV_TMP $HADOOP_ENV_FILE
log "Configuration files of Hadoop was configured successfully"
log "Configuring Hadoop environment..."
if [ -z $HADOOP_HOME ]; then
log "HADOOP_HOME is not found,configuring..."
echo "export HADOOP_HOME=$HADOOP_DIR" >>$ENV
echo 'export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin' >>$ENV
else
log "HADOOP_HOME already existed,skipping..."
fi
if [ -z $HDFS_DATANODE_USER ]; then
log "HDFS_DATANODE_USER is not found,configuring..."
echo "export HDFS_DATANODE_USER=root" >>$ENV
else
log "HDFS_DATANODE_USER already existed,skipping..."
fi
if [ -z $HDFS_DATANODE_SECURE_USER ]; then
log "HDFS_DATANODE_SECURE_USER is not found,configuring..."
echo "export HDFS_DATANODE_SECURE_USER=hdfs" >>$ENV
else
log "HDFS_DATANODE_SECURE_USER already existed,skipping..."
fi
if [ -z $HDFS_NAMENODE_USER ]; then
log "HDFS_NAMENODE_USER is not found,configuring..."
echo "export HDFS_NAMENODE_USER=root" >>$ENV
else
log "HDFS_NAMENODE_USER already existed,skipping..."
fi
if [ -z $HDFS_SECONDARYNAMENODE_USER ]; then
log "HDFS_SECONDARYNAMENODE_USER is not found,configuring..."
echo "export HDFS_SECONDARYNAMENODE_USER=root" >>$ENV
else
log "HDFS_SECONDARYNAMENODE_USER already existed,skipping..."
fi
if [ -z $YARN_RESOURCEMANAGER_USER ]; then
log "YARN_RESOURCEMANAGER_USER is not found,configuring..."
echo "export YARN_RESOURCEMANAGER_USER=root" >>$ENV
else
log "YARN_RESOURCEMANAGER_USER already existed,skipping..."
fi
if [ -z $YARN_NODEMANAGER_USER ]; then
log "YARN_NODEMANAGER_USER is not found,configuring..."
echo "export YARN_NODEMANAGER_USER=root" >>$ENV
else
log "YARN_NODEMANAGER_USER already existed,skipping..."
fi
log "Hadoop's environment was configured successfully"
source $ENV
hadoop version
#----------------------------------------------Hadoop end------------------------------------------------
#----------------------------------------------Scala start------------------------------------------------
log "Installing Scala..."
log "Configuring scala environment..."
if [ -z $SCALA_HOME ]; then
log "SCALA_HOME is not found,configuring..."
echo "export SCALA_HOME=$SCALA_DIR" >>$ENV
echo 'export PATH=$SCALA_HOME/bin:$PATH' >>$ENV
else
log "SCALA_HOME already existed,skipping..."
fi
log " Scala's environment was configured successfully"
source $ENV
scala -version
#----------------------------------------------Scala end------------------------------------------------
#----------------------------------------------Spark start------------------------------------------------
log "Installing Spark..."
log "Configuring slaves file..."
FILE_NAME="slaves"
SLAVES_FILE_TMP="$TEMPLATE_CONF_DIR""$FILE_NAME"
SLAVES_TEMPLATE="$SLAVES_FILE_TMP"".template"
cp $SLAVES_TEMPLATE $SLAVES_FILE_TMP
WORKERS_TMP=${WORKERS//,/\\n}
sed -i "s|WORKERS|$WORKERS_TMP|g" $SLAVES_FILE_TMP
SLAVES_FILE="$SPARK_DIR""/conf/""workers"
SLAVES_FILE_BAK="$SLAVES_FILE"".bak"
rm -rf $SLAVES_FILE_BAK
mv $SLAVES_FILE $SLAVES_FILE_BAK
cp $SLAVES_FILE_TMP $SLAVES_FILE
log "Configuring spark-config.sh..."
FILE_NAME="spark-config.sh"
SPARK_CONFIG_TMP="$TEMPLATE_CONF_DIR""$FILE_NAME"
SPARK_CONFIG_TEMPLATE="$SPARK_CONFIG_TMP"".template"
cp $SPARK_CONFIG_TEMPLATE $SPARK_CONFIG_TMP
sed -i "s|JDKHOME|$JDK_DIR|g" $SPARK_CONFIG_TMP
SPARK_CONFIG_FILE="$SPARK_DIR""/conf/""$FILE_NAME"
SPARK_CONFIG_BAK="$SPARK_CONFIG_FILE"".bak"
rm -rf $SPARK_CONFIG_BAK
mv $SPARK_CONFIG_FILE $SPARK_CONFIG_BAK
cp $SPARK_CONFIG_TMP $SPARK_CONFIG_FILE
log "Configuration files of Spark was configured successfully"
log "Configuring Spark environment..."
if [ -z $SPARK_HOME ]; then
log "SPARK_HOME is not found,configuring..."
echo "export SPARK_HOME=$SPARK_DIR" >>$ENV
echo 'export PATH=$PATH:$SPARK_HOME/bin' >>$ENV
else
log "SPARK_HOME already existed,skipping..."
fi
if [ -z $SPARK_MASTER_HOST ]; then
log "SPARK_MASTER_HOST is not found,configuring..."
echo "export SPARK_MASTER_HOST=$MASTER" >>$ENV
else
log "SPARK_MASTER_HOST already existed,skipping..."
fi
if [ -z $SPARK_WORKER_MEMORY ]; then
log "SPARK_WORKER_MEMORY is not found,configuring..."
echo 'export SPARK_WORKER_MEMORY=2g' >>$ENV
else
log "SPARK_WORKER_MEMORY already existed,skipping..."
fi
if [ -z $SPARK_WORKER_CORES ]; then
log "SPARK_WORKER_CORES is not found,configuring..."
echo 'export SPARK_WORKER_CORES=2' >>$ENV
else
log "SPARK_WORKER_CORES already existed,skipping..."
fi
if [ -z $SPARK_WORKER_INSTANCES ]; then
log "SPARK_WORKER_INSTANCES is not found,configuring..."
echo 'export SPARK_WORKER_INSTANCES=2' >>$ENV
else
log "SPARK_WORKER_INSTANCES already existed,skipping..."
fi
if [ -z "$SPARK_DAEMON_JAVA_OPTS" ]; then
log "SPARK_DAEMON_JAVA_OPTS is not found,configuring..."
CLUSTERS="$MASTER"":2181"
WORKER_ARR=(${WORKERS//,/ })
for item in ${WORKER_ARR[@]}; do
CLUSTERS="$CLUSTERS"",""$item"":2181"
done
echo 'export SPARK_DAEMON_JAVA_OPTS="-Dspark.deploy.recoveryMode=ZOOKEEPER -Dspark.deploy.zookeeper.url='"$CLUSTERS"' -Dspark.deploy.zookeeper.dir='"$ZK_DIR"'/data/"' >>$ENV
else
log "SPARK_DAEMON_JAVA_OPTS already existed,skipping..."
fi
#----------------------------------------------Spark end------------------------------------------------
#----------------------------------------------Zookeeper Start------------------------------------------------
log "Installing Zookeeper..."
log "Configuring zoo.cfg..."
FILE_NAME="zoo.cfg"
ZOOCFG_TMP="$TEMPLATE_CONF_DIR""$FILE_NAME"
ZOOCFG_TEMPLATE="$ZOOCFG_TMP"".template"
cp $ZOOCFG_TEMPLATE $ZOOCFG_TMP
WORKERS_ARR=(${WORKERS//,/ })
COUNT=1
ZK_CLUSTERS="server.1=""$MASTER"":2888:3888\n"
for item in ${WORKERS_ARR[@]}; do
COUNT=$((COUNT+1))
ZK_CLUSTERS="$ZK_CLUSTERS""server.$COUNT=$item:2888:3888\n"
done
sed -i "s|ZK_CLUSTERS|$ZK_CLUSTERS|g" $ZOOCFG_TMP
sed -i "s|CRTDIR|$CRTDIR|g" $ZOOCFG_TMP
ZOOCFG_FILE="$ZK_DIR""/conf/""$FILE_NAME"
ZOOCFG_FILE_BAK="$ZOOCFG_FILE"".bak"
rm -rf $ZOOCFG_FILE_BAK
mv $ZOOCFG_FILE $ZOOCFG_FILE_BAK
cp $ZOOCFG_TMP $ZOOCFG_FILE
echo "1" > $ZK_DIR/data/myid
log "Configuration files of Zookeeper was configured successfully"
#----------------------------------------------Zookeeper end------------------------------------------------
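To apply the script, run it on every node from the directory where the bundle was unpacked. A minimal sketch, assuming the bundle lives in /home/sword/DC (the installation path that appears in the ZooKeeper output below); note that the source /etc/profile inside the script only affects the script's own shell, so source it again in your login shell afterwards:
cd /home/sword/DC
chmod +x setup.sh
./setup.sh
source /etc/profile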
Note: in cluster mode ZooKeeper also needs a myid file. It must be placed in the dataDir directory (the path is set in zoo.cfg) and contains a single value: the server ID, i.e. the A in the server.A=B:C:D entries of zoo.cfg. Set this ID on each machine accordingly.
For example: 192.168.36.128 is 1, 192.168.36.129 is 2, and 192.168.36.130 is 3. On master:
echo "1" > /home/sword/DC/zookeeper/data/myid
Start ZooKeeper
Start zkServer.sh on master, worker1, and worker2:
./zkServer.sh start
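With passwordless SSH in place, you can also start all three instances from master in one loop; a convenience sketch that assumes the same /home/sword/DC installation path on every node (sourcing /etc/profile first so the remote shell picks up the JDK settings written by setup.sh):
for h in master worker1 worker2; do
  ssh root@"$h" "source /etc/profile && /home/sword/DC/zookeeper/bin/zkServer.sh start"
done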
Check the ZooKeeper status (in a three-node ensemble, one node reports Mode: leader and the other two report Mode: follower):
./zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /home/sword/DC/zookeeper/bin/../conf/zoo.cfg
Client port found: 2181. Client address: localhost.
Mode: leader
Start Hadoop
First format the NameNode (run this once, on master only):
hdfs namenode -format
Then go to the sbin directory under the Hadoop installation path and start the cluster:
start-all.sh
Open ip:50070 in a browser to check the started nodes (startup can be slow, so it may take a while before both workers show up).
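The same check can be done from the command line; jps (shipped with the JDK) lists the running Hadoop daemons on each node, and hdfs dfsadmin -report should list both DataNodes as live:
jps
hdfs dfsadmin -report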

Start Spark
Go to the sbin directory under the Spark installation path and start the master and the workers:
./start-all.sh
Then open http://192.168.36.128:8081/ in a browser.
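To smoke-test the Spark cluster you can submit the SparkPi example that ships with Spark; a minimal sketch, assuming the standalone master listens on its default port 7077 and using a wildcard for the examples jar version:
$SPARK_HOME/bin/spark-submit \
  --master spark://master:7077 \
  --class org.apache.spark.examples.SparkPi \
  "$SPARK_HOME"/examples/jars/spark-examples_*.jar 100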

Finally, the Maven pom file for the Spark application is attached below:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.edata.bigdata</groupId>
<artifactId>edata-base</artifactId>
<packaging>pom</packaging>
<version>1.0-SNAPSHOT</version>
<properties>
<java.version>1.8</java.version>
<scala.version>2.12</scala.version>
<!-- <scala.binary.version>2.12</scala.binary.version>-->
<spark.version>3.3.1</spark.version>
<hadoop.version>3.1.2</hadoop.version>
<postgresql.version>42.1.1</postgresql.version>
<mongo.version>10.0.4</mongo.version>
<nebula.version>3.0.0</nebula.version>
<flink.version>1.14.3</flink.version>
<zookeeper.version>3.6.1</zookeeper.version>
<!-- <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> -->
</properties>
<modules>
<module>edata-base-component</module>
</modules>
<dependencies>
<!--Postgresql-->
<!-- <dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>42.3.1</version>
</dependency> -->
<!--mongodb-->
<dependency>
<groupId>org.mongodb.spark</groupId>
<artifactId>mongo-spark-connector</artifactId>
<version>${mongo.version}</version>
</dependency>
<!--Spark-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>com.vesoft</groupId>
<artifactId>nebula-spark-connector</artifactId>
<version>${nebula.version}</version>
</dependency>
<!--ZooKeeper-->
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>${zookeeper.version}</version>
</dependency>
</dependencies>
<build>
<!-- <sourceDirectory>src/main/scala</sourceDirectory>-->
<!-- <testSourceDirectory>src/test/scala</testSourceDirectory>-->
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
<encoding>UTF-8</encoding>
<showWarnings>true</showWarnings>
</configuration>
</plugin>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- <plugin>-->
<!-- <groupId>org.apache.maven.plugins</groupId>-->
<!-- <artifactId>maven-surefire-plugin</artifactId>-->
<!-- <version>2.19</version>-->
<!-- <configuration>-->
<!-- <skip>true</skip>-->
<!-- </configuration>-->
<!-- </plugin>-->
</plugins>
</build>
</project>
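To build the modules declared by this pom, a standard Maven invocation is sufficient (nothing project-specific is assumed here):
mvn clean package -DskipTests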