1. Install the JDK
# apt-get install sun-java6-jdk
2. Set the JDK environment variables
# vi /etc/profile
Append the following at the end of the file (takes effect for all future sessions; adjust JAVA_HOME if your JDK lives elsewhere — the sun-java6-jdk package typically installs under /usr/lib/jvm/java-6-sun):
JAVA_HOME=/usr/java/jdk1.6.0_27
CLASSPATH=.:$JAVA_HOME/lib/tools.jar
PATH=$JAVA_HOME/bin:$PATH
export JAVA_HOME CLASSPATH
export PATH
# source /etc/profile
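A quick check that the variables took effect in the current shell (output shown for the JDK path used above; yours may differ):
# echo $JAVA_HOME
/usr/java/jdk1.6.0_27
# java -version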
3. Download Hadoop and set its JDK path
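One way to fetch and unpack the 0.20.2 release (URL assumed from the Apache archive layout; any mirror works):
# wget http://archive.apache.org/dist/hadoop/core/hadoop-0.20.2/hadoop-0.20.2.tar.gz
# tar -xzf hadoop-0.20.2.tar.gz -C /home
Then point Hadoop at the JDK: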
# gedit /home/hadoop-0.20.2/conf/hadoop-env.sh
export JAVA_HOME=/usr/java/jdk1.6.0_27
4. Configure Hadoop
# sudo gedit /home/hadoop-0.20.2/conf/core-site.xml
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://localhost:9000</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/tmp/hadoop/hadoop-${user.name}</value>
  </property>
</configuration>
# sudo gedit /home/hadoop-0.20.2/conf/hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
</configuration>
# sudo gedit /home/hadoop-0.20.2/conf/mapred-site.xml
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>localhost:9001</value>
  </property>
</configuration>
5. Set the Hadoop environment variables
# vi /etc/profile
Append at the end of the file:
# hadoop
HADOOP_HOME=/home/hadoop-0.20.2
PATH=$HADOOP_HOME/bin:$PATH
export HADOOP_HOME PATH
# source /etc/profile
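A quick sanity check that the hadoop script is now on the PATH:
# hadoop version
Hadoop 0.20.2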
6. Format the NameNode
# hadoop namenode -format
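start-all.sh in the next step launches each daemon over ssh, so logging in to localhost must work without a password. A common setup, assuming RSA keys and the default ~/.ssh layout:
# ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa
# cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
# ssh localhost
The last command should log in without prompting.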
7. Start Hadoop
# bin/start-all.sh
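If startup succeeded, jps (shipped with the JDK) should list all five daemons of a pseudo-distributed setup:
# jps
The output should include NameNode, DataNode, SecondaryNameNode, JobTracker, and TaskTracker, each prefixed with its process ID.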
8. Run the WordCount example
Create a text file test1.txt containing "hello world":
# echo "hello world" > test1.txt
Upload it to HDFS:
# hadoop fs -put test1.txt input
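A quick check that the file arrived:
# hadoop fs -ls input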
Run the job:
# hadoop jar hadoop-0.20.2-examples.jar wordcount input output
On success, the console output looks like this:
12/02/05 19:54:45 INFO input.FileInputFormat: Total input paths to process : 1
12/02/05 19:54:45 INFO mapred.JobClient: Running job: job_201202051926_0001
12/02/05 19:54:46 INFO mapred.JobClient: map 0% reduce 0%
12/02/05 19:55:03 INFO mapred.JobClient: map 100% reduce 0%
12/02/05 19:55:15 INFO mapred.JobClient: map 100% reduce 100%
12/02/05 19:55:17 INFO mapred.JobClient: Job complete: job_201202051926_0001
12/02/05 19:55:17 INFO mapred.JobClient: Counters: 17
12/02/05 19:55:17 INFO mapred.JobClient: Job Counters
12/02/05 19:55:17 INFO mapred.JobClient: Launched reduce tasks=1
12/02/05 19:55:17 INFO mapred.JobClient: Launched map tasks=1
12/02/05 19:55:17 INFO mapred.JobClient: Data-local map tasks=1
12/02/05 19:55:17 INFO mapred.JobClient: FileSystemCounters
12/02/05 19:55:17 INFO mapred.JobClient: FILE_BYTES_READ=30
12/02/05 19:55:17 INFO mapred.JobClient: HDFS_BYTES_READ=12
12/02/05 19:55:17 INFO mapred.JobClient: FILE_BYTES_WRITTEN=92
12/02/05 19:55:17 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=16
12/02/05 19:55:17 INFO mapred.JobClient: Map-Reduce Framework
12/02/05 19:55:17 INFO mapred.JobClient: Reduce input groups=2
12/02/05 19:55:17 INFO mapred.JobClient: Combine output records=2
12/02/05 19:55:17 INFO mapred.JobClient: Map input records=1
12/02/05 19:55:17 INFO mapred.JobClient: Reduce shuffle bytes=30
12/02/05 19:55:17 INFO mapred.JobClient: Reduce output records=2
12/02/05 19:55:17 INFO mapred.JobClient: Spilled Records=4
12/02/05 19:55:17 INFO mapred.JobClient: Map output bytes=20
12/02/05 19:55:17 INFO mapred.JobClient: Combine input records=2
12/02/05 19:55:17 INFO mapred.JobClient: Map output records=2
12/02/05 19:55:17 INFO mapred.JobClient: Reduce input records=2
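The counts can then be read straight from HDFS (file name assumed; the new-API examples in 0.20.x write part-r-00000):
# hadoop fs -cat output/part-r-00000
hello	1
world	1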
PS: If the job has already been run once, it fails with org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory output already exists.
Fix: delete the output directory and rerun the job:
# hadoop fs -rmr output