Big Data Development Learning Platform: Installation and Configuration

This article walks through building a big data cluster, covering the configuration and startup of Hadoop, Spark, HBase, and related components, and lists the concrete settings for each configuration file.


This guide deliberately skips passwordless SSH login between the servers and the creation and permission setup of the related user accounts.
For the full steps, see the separate notes on passwordless login and on adding and configuring Linux accounts; a minimal sketch is also given below.
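
For reference, a minimal sketch of the passwordless-login setup this guide assumes. The user name hadoop is an assumption based on the /home/hadoop paths used later; the host names master/slave1/slave2 match the rest of this guide:

# Generate a key pair on the master (accept the defaults, empty passphrase)
ssh-keygen -t rsa

# Copy the public key to every node, including the master itself
ssh-copy-id hadoop@master
ssh-copy-id hadoop@slave1
ssh-copy-id hadoop@slave2

# Verify: this should print the remote hostname without asking for a password
ssh hadoop@slave1 hostname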

Software versions

jdk
jdk-8u91-linux-x64.tar.gz

scala
scala-2.10.6.tgz

hadoop
hadoop-2.6.4.tar.gz
hadoop-2.6.4-src.tar.gz (source code)

mahout
apache-mahout-distribution-0.12.2.tar.gz

hive
apache-hive-2.1.0-bin.tar.gz
mysql-connector-java-5.1.39.tar.gz (MySQL JDBC driver for the metastore)

spark
spark-1.6.2-bin-hadoop2.6.tgz (pick the build matching your Hadoop version)
spark-1.6.2.tgz (source code)

hbase
zookeeper-3.4.8.tar.gz (install ZooKeeper beforehand)
hbase-1.2.2-bin.tar.gz

storm
apache-storm-1.0.2.tar.gz
apache-storm-1.0.2-src.tar.gz (source code)

sqoop
sqoop-1.99.6-bin-hadoop200.tar.gz

Three cluster modes (each mode is illustrated with a submit command after the list)

local (single machine)

standalone

yarn
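
From the client's point of view, the three modes differ mainly in the --master argument passed when submitting a job. A sketch using the Spark example jar configured later in this guide (the jar name assumes the Spark 1.6.2 binary distribution):

# local: everything runs inside a single JVM on this machine
spark-submit --master local[2] --class org.apache.spark.examples.SparkPi $SPARK_HOME/lib/spark-examples-1.6.2-hadoop2.6.0.jar

# standalone: submit to Spark's own master, started later with start-all.sh
spark-submit --master spark://master:7077 --class org.apache.spark.examples.SparkPi $SPARK_HOME/lib/spark-examples-1.6.2-hadoop2.6.0.jar

# yarn: let YARN schedule the executors (requires HADOOP_CONF_DIR to point at the Hadoop config)
spark-submit --master yarn --deploy-mode client --class org.apache.spark.examples.SparkPi $SPARK_HOME/lib/spark-examples-1.6.2-hadoop2.6.0.jar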


Configuration checklist

JDK

profile

export JAVA_HOME=/usr/local/jdk1.8.0_91
export JRE_HOME=$JAVA_HOME/jre
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=$PATH:$JAVA_HOME/bin:$JRE_HOME/bin

scala

profile

# scala
export SCALA_HOME=/usr/local/scala-2.10.6
export PATH=$PATH:$SCALA_HOME/bin
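
After reloading the profile, a quick sanity check that both runtimes are on the PATH (versions should match the tarballs listed above):

source ~/.bash_profile
java -version     # should report 1.8.0_91
scala -version    # should report 2.10.6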

Hadoop

profile

# hadoop
export HADOOP_HOME=/usr/local/hadoop-2.6.4
export HADOOP_PREFIX=$HADOOP_HOME
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

*.env

$HADOOP_HOME/etc/hadoop/hadoop-env.sh, $HADOOP_HOME/etc/hadoop/mapred-env.sh, $HADOOP_HOME/etc/hadoop/yarn-env.sh

source ~/.bash_profile

# For yarn-env.sh it is enough to set JAVA_HOME; if it is missing, the ResourceManager and NodeManager classes cannot be found:
# Error: Could not find or load main class org.apache.hadoop.yarn.server.resourcemanager.ResourceManager
# Error: Could not find or load main class org.apache.hadoop.yarn.server.nodemanager.NodeManager

slaves

$HADOOP_HOME/etc/hadoop/slaves

slave1
slave2

core-site

$HADOOP_HOME/etc/hadoop/core-site.xml

<configuration>
    <property>
            <name>hadoop.tmp.dir</name>
            <value>/home/hadoop/tmp</value>
            <description>A base for other temporary directories.</description>
    </property>

    <property>
            <name>fs.default.name</name>
            <value>hdfs://master:9000</value>
            <final>true</final>
            <description>The name of the default file system.  A URI whose
            scheme and authority determine the FileSystem implementation.  The
            uri's scheme determines the config property (fs.SCHEME.impl) naming
            the FileSystem implementation class.  The uri's authority is used to
            determine the host, port, etc. for a filesystem.</description>
    </property>
</configuration>

hdfs-site

$HADOOP_HOME/etc/hadoop/hdfs-site.xml

<configuration>
  <property>
    <name>dfs.datanode.ipc.address</name>
    <value>0.0.0.0:50020</value>
  </property>
  <property>
    <name>dfs.datanode.http.address</name>
    <value>0.0.0.0:50075</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>2</value>
  </property>
</configuration>

mapred-site

$HADOOP_HOME/etc/hadoop/mapred-site.xml

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>

    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>master:10020</value>
    </property>

    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>master:19888</value>
    </property>
</configuration>

yarn-site

$HADOOP_HOME/etc/hadoop/yarn-site.xml

<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>

    <property>
        <name>yarn.resourcemanager.address</name>
        <value>master:8032</value>
    </property>

    <property>
        <name>yarn.resourcemanager.scheduler.address</name>
        <value>master:8030</value>
    </property>

    <property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>master:8031</value>
    </property>
</configuration>

Starting HDFS and YARN

# Format HDFS (first run only)
$HADOOP_HOME/bin/hdfs namenode -format

# Start HDFS
$HADOOP_HOME/sbin/start-dfs.sh

# Start the YARN resource management cluster
$HADOOP_HOME/sbin/start-yarn.sh

# Start the MapReduce job history server
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver
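
If everything came up cleanly, jps should show the following daemons (assuming the master/slave layout used in this guide):

# On master
jps   # NameNode, SecondaryNameNode, ResourceManager, JobHistoryServer

# On slave1 / slave2
jps   # DataNode, NodeManager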

Test

$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.6.4.jar wordcount <INPUT> <OUTPUT>
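
For example, with throwaway input and output paths (the paths below are illustrative; the output directory must not already exist):

$HADOOP_HOME/bin/hdfs dfs -mkdir -p /tmp/wc-in
$HADOOP_HOME/bin/hdfs dfs -put $HADOOP_HOME/etc/hadoop/*.xml /tmp/wc-in
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.6.4.jar wordcount /tmp/wc-in /tmp/wc-out
$HADOOP_HOME/bin/hdfs dfs -cat /tmp/wc-out/part-r-00000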

Ports

Port   Description
8088   All Applications (YARN web UI)
9000   HDFS NameNode RPC
50070  NameNode information (web UI)
50090  SecondaryNameNode information (web UI)
19888  JobHistory (web UI)

pig

profile

# pig
export PIG_HOME=/usr/local/pig-0.16.0
export PIG_CLASSPATH=$HADOOP_HOME/etc/hadoop
export PATH=$PATH:$PIG_HOME/bin
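
A quick installation check is to launch the Grunt shell; -x local runs against the local filesystem, while the default mapreduce mode uses the cluster configuration pointed to by PIG_CLASSPATH:

# Local mode, no cluster needed
pig -x local

# MapReduce mode against the cluster
pig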

spark

profile

# spark
export SPARK_HOME=/usr/local/spark-1.6.2-bin-hadoop2.6
export PATH=$PATH:$SPARK_HOME/bin

spark-env

$SPARK_HOME/conf/spark-env.sh

# applies to local, standalone and yarn modes
source ~/.bash_profile
export SPARK_MASTER_IP=master

# history server
export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=7777 -Dspark.history.retainedApplications=2 -Dspark.history.fs.logDirectory=hdfs://master:9000/sparklog"

# other settings
export SPARK_WORKER_CORES=2
export SPARK_WORKER_MEMORY=1G

slaves

$SPARK_HOME/conf/slaves

slave1
slave2

spark-defaults.conf

$SPARK_HOME/conf/spark-defaults.conf

# history server settings
spark.eventLog.enabled  true
spark.eventLog.dir      hdfs://master:9000/sparklog
spark.eventLog.compress true
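
Both spark.eventLog.dir above and SPARK_HISTORY_OPTS point at hdfs://master:9000/sparklog; that directory must exist before jobs are run or the history server is started:

# Create the Spark event log directory on HDFS
$HADOOP_HOME/bin/hdfs dfs -mkdir -p /sparklog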

Starting Spark

# Start the Spark standalone cluster (master and workers)
$SPARK_HOME/sbin/start-all.sh

# Start the HistoryServer
$SPARK_HOME/sbin/start-history-server.sh

Test

# 1
$SPARK_HOME/bin/spark-submit --class org.apache.spark.examples.SparkPi $SPARK_HOME/lib/spark-examples-1.6.2-hadoop2.6.0.jar

# 2
MASTER=local $SPARK_HOME/bin/run-example SparkPi

Ports

Port   Description
4040   Application (web UI)
7077   Master (spark:// submit port)
7777   History server (web UI, as set in SPARK_HISTORY_OPTS)

zookeeper

profile

# zookeeper
export ZOOKEEPER_HOME=/usr/local/zookeeper-3.4.8
export PATH=$PATH:$ZOOKEEPER_HOME/bin

zoo.cfg

$ZOOKEEPER_HOME/conf/zoo.cfg

# The number of milliseconds of each tick
tickTime=2000
# The number of ticks that the initial 
# synchronization phase can take
initLimit=10
# The number of ticks that can pass between 
# sending a request and getting an acknowledgement
syncLimit=5
# the directory where the snapshot is stored.
dataDir=/home/zookeeper/data
# dataLogDir=/home/zookeeper/logs
# the port at which the clients will connect
clientPort=2181

server.1=master:2888:3888
server.2=slave1:2888:3888
server.3=slave2:2888:3888

myid

Create the myid file at /home/zookeeper/data/myid

# Each machine gets its own id; the number must match the server.* entry for that host in zoo.cfg above
echo 1 >> /home/zookeeper/data/myid
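
For example, matching the server.1/2/3 entries above (a sketch, assuming the same dataDir on every node):

# On master
echo 1 > /home/zookeeper/data/myid
# On slave1
echo 2 > /home/zookeeper/data/myid
# On slave2
echo 3 > /home/zookeeper/data/myid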

Start

$ZOOKEEPER_HOME/bin/zkServer.sh start
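
Run the start command on every node in the ensemble, then check each node's role; one node should report leader and the others follower:

$ZOOKEEPER_HOME/bin/zkServer.sh status

# Optionally connect with the CLI client
$ZOOKEEPER_HOME/bin/zkCli.sh -server master:2181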

Ports

Port   Description
2181   client connections
2888   peer connections (followers to the leader)
3888   leader election

hbase

profile

# hbase
export HBASE_HOME=/usr/local/hbase-1.2.2
export PATH=$PATH:$HBASE_HOME/bin

hbase-env

$HBASE_HOME/conf/hbase-env.sh

source ~/.bash_profile
# export JAVA_HOME HADOOP_HOME HBASE_HOME
export HBASE_CLASSPATH=$HADOOP_HOME/etc/hadoop
# ZooKeeper is installed and started separately (see the zookeeper section), so HBase must not manage it
export HBASE_MANAGES_ZK=false
export HBASE_LOG_DIR=$HBASE_HOME/logs

regionservers

$HBASE_HOME/conf/regionservers

slave1
slave2

hbase-site.xml

$HBASE_HOME/conf/hbase-site.xml

<configuration>
  <property>
    <name>hbase.master</name>
    <value>master:6000</value>
  </property>
  <property>
    <name>hbase.master.maxclockskew</name>
    <value>180000</value>
  </property>
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://master:9000/hbase</value>
  </property>
  <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
  </property>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>master,slave1,slave2</value>
  </property>
  <property>
    <name>hbase.zookeeper.property.dataDir</name>
    <value>/home/zookeeper/data</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
</configuration>

Starting HBase

$HBASE_HOME/bin/start-hbase.sh

Test

$HBASE_HOME/bin/hbase shell
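
A minimal smoke test can be piped into the shell non-interactively (the table and column family names are made up for illustration):

echo -e "create 'test','cf'\nput 'test','row1','cf:a','v1'\nscan 'test'\ndisable 'test'\ndrop 'test'" | $HBASE_HOME/bin/hbase shell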

Ports

Port   Description
16010  HBase Master (web UI)

hive

profile

# hive
export HIVE_HOME=/usr/local/apache-hive-2.1.0-bin
export PATH=$PATH:$HIVE_HOME/bin

hive-env.sh

$HIVE_HOME/conf/hive-env.sh

source ~/.bash_profile
export HIVE_CONF_DIR=$HIVE_HOME/conf

hive-site.xml

$HIVE_HOME/conf/hive-site.xml

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://master:3306/hive?createDatabaseIfNotExist=true</value>
    <description>JDBC connect string for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>hive</value>
    <description>username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>hive</value>
    <description>password to use against metastore database</description>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>hdfs://master:9000/user/hive/warehouse</value>
  </property>
  <property>
    <name>hive.hwi.listen.host</name>
    <value>0.0.0.0</value>
  </property>
  <property>
    <name>hive.hwi.listen.port</name>
    <value>9999</value>
  </property>
  <property>
    <name>hive.hwi.war.file</name>
    <value>lib/hive-hwi-2.1.0.war</value>
  </property>
</configuration>

Initialization
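
Hive 2.x requires the metastore schema to be created before first use. Assuming the MySQL JDBC driver has been copied into $HIVE_HOME/lib and the hive database/user from hive-site.xml already exist in MySQL, initialize the schema with schematool:

# Copy the JDBC driver into Hive's lib directory (the jar path is illustrative)
cp mysql-connector-java-5.1.39-bin.jar $HIVE_HOME/lib/

# Create the metastore schema in the MySQL database configured above
$HIVE_HOME/bin/schematool -dbType mysql -initSchema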

Test

$HIVE_HOME/bin/hive
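
A tiny end-to-end check exercises both the metastore and the HDFS warehouse directory (the table name is illustrative):

$HIVE_HOME/bin/hive -e "CREATE TABLE t_test (id INT, name STRING); SHOW TABLES; DROP TABLE t_test;"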

sqoop

profile

# sqoop
export SQOOP_HOME=/usr/local/sqoop-1.99.6-bin-hadoop200
export PATH=$PATH:$SQOOP_HOME/bin
export CATALINA_BASE=$SQOOP_HOME/server
export LOGDIR=$SQOOP_HOME/logs

Test

# Start the Sqoop2 server
$SQOOP_HOME/bin/sqoop2-server start

# interactive client shell
$SQOOP_HOME/bin/sqoop2-shell
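
Inside the shell, point the client at the running server before creating links and jobs; 12000 is the default Sqoop2 server port (a sketch, assuming the defaults shipped with 1.99.6):

# Typed at the sqoop:000> prompt
#   set server --host master --port 12000 --webapp sqoop
#   show version --all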

mahout

storm

Third-party package management tools

maven

Download
Just add its bin directory to PATH.

sbt

Homebrew (Third-party package)

$ brew install sbt

Macports (Third-party package)

$ port install sbt

Download (manual install)

To be continued...
