hadoop原生版安装部署---3.hdfs

本文详细介绍Hadoop集群的搭建过程,包括环境配置、服务启动及验证等步骤,并针对Snappy压缩问题提供解决方案。

1.下载安装

 tar xzvf hadoop-2.2.0.tar.gz -C ../
 mv hadoop-2.2.0/ hadoop/

2.bash_profile

su - hadoop
  export HADOOP_PREFIX="/home/hadoop/hadoop"
  export HADOOP_MAPRED_HOME=$HADOOP_PREFIX
  export HADOOP_COMMON_HOME=$HADOOP_PREFIX
  export HADOOP_HOME=$HADOOP_PREFIX
  export HADOOP_HDFS_HOME=$HADOOP_PREFIX
  export HADOOP_CONF_DIR=$HADOOP_PREFIX/etc/hadoop
  export YARN_CONF_DIR=$HADOOP_PREFIX/etc/hadoop
  export YARN_HOME=$HADOOP_PREFIX
  export PATH=$PATH:$HADOOP_PREFIX/bin:$HADOOP_PREFIX/sbin

  source ~/.bash_profile

3.hadoop-env.sh
vi /home/hadoop/hadoop/etc/hadoop/hadoop-env.sh

# The java implementation to use.
export JAVA_HOME=/usr/local/jdk1.6.0_45
export HADOOP_HOME="/home/hadoop/hadoop"

export JAVA_LIBRARY_PATH="/usr/lib:/usr/local/lib:$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native"

#测试机器内存为8G。这里的-Xms与-Xmx分别指JVM堆内存的初始大小(最小值)和最大大小。
export HADOOP_NAMENODE_OPTS="-server -Xmx2G -Xms2G -Xmn1G -XX:MaxPermSize=512m -XX:PermSize=512m -XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
export HADOOP_DATANODE_OPTS="-server -Xmx1G -Xms1G -Xmn720M -XX:MaxPermSize=512m -XX:PermSize=512m -XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
export HADOOP_SECONDARYNAMENODE_OPTS="-server -Xmx2G -Xms2G -Xmn1G -XX:MaxPermSize=512m -XX:PermSize=512m -XX:+UseParNewGC -XX:+UseConcMarkSweepGC "

export HADOOP_PID_DIR=${HADOOP_PREFIX}/pids

4.yarn-env.sh

YARN_HEAPSIZE=512  #默认值1000

   #指定ResourceManager和NodeManager的启动参数
   #测试机器
   export YARN_RESOURCEMANAGER_OPTS="-server -Xmx2G -Xms2G -Xmn1G -XX:MaxPermSize=512m -XX:PermSize=512m -XX:+UseParNewGC -XX:+UseConcMarkSweepGC "
   export YARN_NODEMANAGER_OPTS="-server -Xmx1G -Xms1G -Xmn640M -XX:MaxPermSize=360m -XX:PermSize=360m -XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:-UseGCOverheadLimit"

   export YARN_PID_DIR=${HADOOP_PREFIX}/pids

5.core-site.xml

<configuration>

    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://bvdata</value>
    </property>

    <property>
        <name>hadoop.tmp.dir</name>
        <value>/home/hadoop/hadoop/tmpdir</value>
    </property>

    <property>
      <name>fs.trash.interval</name>
      <value>1440</value>
      <description>Number of minutes between trash checkpoints. If zero, the trash feature is disabled.</description>
    </property>

    <property>
        <name>io.compression.codecs</name>
    <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.SnappyCodec</value>
    </property>
</configuration>

6.hdfs-site.xml

<configuration>
  <property>
      <name>dfs.namenode.name.dir</name>
      <value>file:/home/hadoop/hadoop/data/hadoop/dfs/name</value>
      <description>namenode data dir</description>
       <final>true</final>
  </property>

  <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/home/hadoop/hadoop/data/hadoop/dfs/data</value>
        <description>
            datanode data dir
        </description>
        <final>true</final>
  </property> 

  <property>
        <name>dfs.namenode.checkpoint.dir</name>
        <value>file:/home/hadoop/hadoop/data/hadoop/namesecondary</value>
        <description>
            secondary namenode data dir
        </description>
        <final>true</final>
   </property>

    <!--指定hdfs的nameservice为bvdata,需要和core-site.xml中的保持一致 --> 
    <property>
        <name>dfs.nameservices</name>
        <value>bvdata</value>
        <description>提供服务的NS逻辑名称,与core-site.xml里的对应</description>
    </property>

    <property>
        <name>dfs.ha.namenodes.bvdata</name>
        <value>c9test91,c9test92</value>
    </property>

    <property>
        <name>dfs.namenode.rpc-address.bvdata.c9test91</name>
        <value>c9test91:9000</value>
    </property>

    <property>
        <name>dfs.namenode.http-address.bvdata.c9test91</name>
        <value>c9test91:50070</value>
    </property>

    <property>
        <name>dfs.namenode.rpc-address.bvdata.c9test92</name>
        <value>c9test92:9000</value>
    </property>

    <property>
        <name>dfs.namenode.http-address.bvdata.c9test92</name>
        <value>c9test92:50070</value>
    </property>

    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/home/hadoop/hadoop/data/hadoop/journal</value>
    </property>

    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://c9test91:8485;c9test92:8485;c9test93:8485/bvdata</value>
        <description>
                如何启动JournalNode:在JournalNode的各个节点上部署一份Hadoop代码,在hdfs-site.xml中通过dfs.journalnode.edits.dir设置数据存放目录(注意,只能配置一个目录),
                    然后执行“sbin/hadoop-daemon.sh start journalnode”,启动JournalNode服务。
        </description>
    </property>

    <property>
        <name>ha.zookeeper.quorum</name>
        <value>c9test91:2181,c9test92:2181,c9test93:2181</value>
        <description>指定用于HA的ZooKeeper集群机器列表</description>
    </property>

    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>

    <property>
        <name>dfs.journalnode.rpc-address</name>
        <value>0.0.0.0:8485</value>
        <description>journalnode的rpc地址</description>
    </property>

    <property>
        <name>dfs.client.failover.proxy.provider.bvdata</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
        <description>指定hdfs client用来识别bvdata命名空间、并在namenode切换期间定位active namenode的proxy类</description>
    </property>

    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>sshfence</value>
    </property>

    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/home/hadoop/.ssh/id_rsa</value>
    </property>

    <property>
        <name>ha.health-monitor.rpc-timeout.ms</name>
        <value>90000</value>
    </property>

    <property>
        <name>ha.failover-controller.cli-check.rpc-timeout.ms</name>
        <value>60000</value>
    </property>

    <property>
        <name>ipc.client.connect.timeout</name>
        <value>60000</value>
    </property>

    <property>
        <name>dfs.client.read.shortcircuit.buffer.size</name>
        <value>4096</value>
    </property>

    <property>
        <name>dfs.image.transfer.bandwidthPerSec</name>
        <value>4194304</value>
    </property>

    <property>  
      <name>dfs.hosts.exclude</name>  
      <value>/home/hadoop/hadoop/etc/hadoop/excludes</value>
    </property>

    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property> 

    <property>
        <name>dfs.permissions</name>
        <value>false</value>
    </property> 

    <property>
        <name>dfs.support.append</name>
        <value>true</value>
    </property>

    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>

    <property>
        <name>dfs.datanode.max.xcievers</name>
        <value>4096</value>
    </property>

    <property>
        <name>dfs.qjournal.write-txns.timeout.ms</name>
        <value>600000000</value>
    </property>
</configuration>

7.mapred-site.xml

<property>
         <name>mapreduce.framework.name</name>
         <value>yarn</value>
    </property> 

    <property>
         <name>mapreduce.map.output.compress</name>
         <value>true</value>
    </property>

    <property>
         <name>mapreduce.map.output.compress.codec</name>
         <value>org.apache.hadoop.io.compress.SnappyCodec</value>
    </property>

    <!-- mr目录在datanode上 /home/hadoop/hadoop/data/hadoop/mapred/system   -->
    <property>
        <name>mapred.system.dir</name>
        <value>file:/home/hadoop/hadoop/data/hadoop/mapred/system</value>
        <final>true</final>
    </property> 

    <!-- mr目录在datanode上 /home/hadoop/hadoop/data/hadoop/mapred/local   -->
    <property>
        <name>mapred.local.dir</name>
        <value>file:/home/hadoop/hadoop/data/hadoop/mapred/local</value>
        <final>true</final>
    </property>

    <property>
        <name>mapreduce.map.memory.mb</name>
        <value>1024</value>
        <description>每个MapReduce作业的map任务可以申请的内存资源数量</description>
    </property>

    <property>
        <name>mapreduce.map.cpu.vcores</name>
        <value>1</value>
        <description>每个MapReduce作业的map任务可以申请的虚拟CPU资源的数量</description>
    </property>

    <property>
        <name>mapreduce.reduce.memory.mb</name>
        <value>1024</value>
        <description>每个MapReduce作业的reduce任务可以申请的内存资源数量</description>
    </property>

8.yarn-site.xml

<property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
        <description>shuffle service that needs to be set for Map Reduce to run</description>
    </property>

    <property>
        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
        <description>新框架中 NodeManager 与 RM 通信的接口class</description>
    </property>

    <property>
        <name>yarn.resourcemanager.webapp.address</name>
        <value>c9test93:8088</value>
        <description>新框架中各个 task 的资源调度及运行状况通过该 web 界面访问</description>
    </property>

    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>c9test93</value>
    </property>


    <property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>c9test93:8031</value>
    </property>

    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
    <value>4096</value>
    <description>
    Hadoop YARN可以使用的最大内存量,用来控制每个Node上能运行MapReduce的数量。默认8GB
    </description>
    </property>

    <property>
        <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>4</value>
    <description>
    Hadoop 2.x以上提供内存和CPU两种资源的调度方式,该参数控制Node上可用的虚拟CPU数量。默认是8
    </description>
    </property>

    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>

9.slaves

   c9test93
   c9test94
   #这里需要说明:在91上启动hdfs,这里的slaves指的是datanode ,在93上启动yarn(resourcemanager),这里的slaves是指的nodemanager

10.cp文件到所有机器

  scp -r /home/hadoop/hadoop/ c9test92:/home/hadoop/
  scp -r /home/hadoop/hadoop/ c9test93:/home/hadoop/
  scp -r /home/hadoop/hadoop/ c9test94:/home/hadoop/
  scp ~/.bash_profile c9test92:~/
  scp ~/.bash_profile c9test93:~/
  scp ~/.bash_profile c9test94:~/

11.目录规划创建

#91:namenode journalnode
/home/hadoop/hadoop/tmpdir
/home/hadoop/hadoop/pids
/home/hadoop/hadoop/data/hadoop/dfs/name  
/home/hadoop/hadoop/data/hadoop/namesecondary  
/home/hadoop/hadoop/data/hadoop/journal 
echo > /home/hadoop/hadoop/etc/hadoop/excludes

#92:namenode journalnode
/home/hadoop/hadoop/tmpdir
/home/hadoop/hadoop/pids
/home/hadoop/hadoop/data/hadoop/dfs/name 
/home/hadoop/hadoop/data/hadoop/namesecondary  
/home/hadoop/hadoop/data/hadoop/journal 
echo > /home/hadoop/hadoop/etc/hadoop/excludes

#93:datanode resourcemanager journalnode nodemanager
/home/hadoop/hadoop/tmpdir
/home/hadoop/hadoop/pids
/home/hadoop/hadoop/data/hadoop/dfs/data 
/home/hadoop/hadoop/data/hadoop/mapred/system  
/home/hadoop/hadoop/data/hadoop/mapred/local  
/home/hadoop/hadoop/data/hadoop/journal  
echo > /home/hadoop/hadoop/etc/hadoop/excludes

#94:datanode  nodemanager
/home/hadoop/hadoop/tmpdir
/home/hadoop/hadoop/pids
/home/hadoop/hadoop/data/hadoop/dfs/data  
/home/hadoop/hadoop/data/hadoop/mapred/system  
/home/hadoop/hadoop/data/hadoop/mapred/local  
/home/hadoop/hadoop/data/hadoop/journal 
echo > /home/hadoop/hadoop/etc/hadoop/excludes

12.初始化及启动服务
12.1 启动zookeeper集群

zkServer.sh start  #或者用自己写的zkrun.sh 注意启动后查看status

12.2 格式化HDFS

#首先在91、92、93这三台journalnode节点上分别单独启动journalnode,注意不要用hadoop-daemons.sh
sbin/hadoop-daemon.sh start journalnode    #jps可以看到多了JournalNode进程

#namenode91上执行:
hdfs namenode -format 

#将91执行完毕后/home/hadoop/hadoop/data/hadoop/dfs/name   scp到92上
(或者在92的备NN上执行 hdfs namenode -bootstrapStandby 来同步主NN的元数据信息;注意使用此命令时91的NN需要处于启动状态)

12.3 格式化ZK

hdfs zkfc -formatZK 

12.4 hadoop启动
hadoop启动有几种方式:
1)方法一:zk → dfs → yarn

 91:zkrun.sh start
   91:start-dfs.sh 
   93:start-yarn.sh
   关闭:
   93:stop-yarn.sh
   91:stop-dfs.sh
   91:zkrun.sh stop 

2)方法二(生产系统,为了详细看哪步出错):

启动:
     (1)zk
     (2)hadoop-daemon.sh start journalnode (91 92 93)
     (3)hadoop-daemon.sh start namenode(91 92)
     (4)hadoop-daemon.sh start datanode(93 94)
      (5) yarn-daemon.sh start resourcemanager(93)
      (6) yarn-daemon.sh start nodemanager(93 94)
      (7) hadoop-daemon.sh start zkfc(91 92)
      (8) hbase启动hmaster(主 备) start-hbase.sh
      (9) hbase启动HRegionServer hbase-daemon.sh start master

   关闭:
      (1) hbase关闭HRegionServer hbase-daemon.sh stop master
      (2) hbase关闭hmaster(主 备) stop-hbase.sh
      (3) hadoop-daemon.sh stop zkfc(91 92)
      (4) yarn-daemon.sh stop nodemanager(93 94)
      (5) yarn-daemon.sh stop resourcemanager(93)
      (6) hadoop-daemon.sh stop datanode(93 94)
      (7) hadoop-daemon.sh stop namenode(91 92)
     (8)hadoop-daemon.sh stop journalnode (91 92 93)
      (9) zk

12.5 验证

#1)web验证
  #namenode:
  http://c9test91:50070  http://c9test92:50070 #kill掉一个看另外一个的active状态
  #ResourceManager:
 http://c9test93:8088

 #2)上传文件
   hdfs dfs -put jdk-6u45-linux-x64.bin /jdk.bin 
   hdfs dfs -ls /

  #3)MR测试
    创建一个文件word.txt如 ABC bdc ABC dfe def def等
    上传到hdfs目录 hdfs dfs -put word.txt /word
    执行MR
    hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.2.0.jar wordcount /word /out
    #注意这里因为我们配置了mr的snappy压缩,导致这里直接使用wordcount会报错,下一篇文章会介绍snappy安装和部署,安装后执行MR会成功

13.snappy问题
因为core-site.xml中io.compression.codecs配置了org.apache.hadoop.io.compress.SnappyCodec
mapred-site.xml中的mapreduce.map.output.compress配置了true
mapred-site.xml中的mapreduce.map.output.compress.codec配置了org.apache.hadoop.io.compress.SnappyCodec
也就是说明在执行MR的时候是用snappy压缩的,因此需要单独在系统中安装snappy

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值