Software preparation
Setting up a standalone development environment on Windows
- Install the JDK (make sure the installation directory contains no Chinese characters or spaces; note that the default directory, Program Files, contains a space) and configure the JAVA_HOME environment variable.
- Download the Hadoop release you need: open a browser on Windows and download the Hadoop binary package from http://hadoop.apache.org/releases.html
- Extract the archive (again, make sure the extraction directory contains no Chinese characters or spaces).
- Set the HADOOP_HOME environment variable to the extraction directory.
- Add %HADOOP_HOME%\bin to the system Path variable (a quick verification sketch follows this list).
- At this point the standalone Hadoop development environment is complete, and you can start developing simple MapReduce programs.
- Write a simple WordCount job as a test; if it runs, the installation was successful.
- Detailed code: create a new Maven project, add dependencies matching the Hadoop version you installed, and create three Java classes: WordcountMapper, WordCountReducer, and WordCountDriver.
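Before moving on to the code, a quick sanity check of the environment variables; this sketch assumes the variables above were saved and a fresh command prompt was opened afterwards:
rem Verify JAVA_HOME and HADOOP_HOME from a newly opened command prompt
echo %JAVA_HOME%
%JAVA_HOME%\bin\java -version
echo %HADOOP_HOME%
hadoop version
If both version commands print sensible output, the variables are set correctly.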
1.pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.zys</groupId>
    <artifactId>hadoop2-init</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.8.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass>com.zys.wordcount.WordCountDriver</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
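With the assembly plugin configured this way, running the command below (assuming Maven is on the Path) also produces a self-contained jar-with-dependencies jar whose entry point is WordCountDriver. This step is optional for the local test, since the driver below can simply be run from the IDE:
rem Optional: build a runnable fat jar into the target directory
mvn clean package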
2.WordcountMapper.java
package com.zys.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @author zhengyunshuo
 * @date 2020/10/19 - 12:26
 */
/**
 * Map phase
 * KEYIN    key type of the input data (byte offset of the line)
 * VALUEIN  value type of the input data (one line of text)
 * KEYOUT   key type of the output data, e.g. (aa,1) (bb,1)
 * VALUEOUT value type of the output data
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reuse these objects instead of creating new ones inside the loop
    Text k = new Text();
    IntWritable v = new IntWritable(1);

    /**
     * Called once for each line read from the input file
     * @param key
     * @param value
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get one line of input
        String line = value.toString();
        // 2. Split it into words
        String[] words = line.split(" ");
        // 3. Emit (word, 1) for each word
        for (String word : words) {
            // Text k = new Text();
            k.set(word);
            // IntWritable v = new IntWritable();
            // v.set(1); // not needed, v is initialized to 1 above
            context.write(k, v);
        }
    }
}
3.WordCountReducer.java
package com.zys.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @author zhengyunshuo
 * @date 2020/10/19 - 12:42
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        // Sum the counts for this word
        for (IntWritable value : values) {
            sum += value.get();
        }
        v.set(sum);
        // Emit (word, total count)
        context.write(key, v);
    }
}
4.WordCountDriver.java
package com.zys.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author zhengyunshuo
 * @date 2020/10/19 - 13:59
 */
public class WordCountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // 1. Get the Job instance
        Job job = Job.getInstance(conf);
        // 2. Set the jar location
        job.setJarByClass(WordCountDriver.class);
        // 3. Hook up the Mapper and Reducer classes
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 4. Set the key and value types of the Mapper output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Set the key and value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths
        String input = "E:\\Workspace\\ideaworksapce\\wordcount\\src\\data";
        String output = "E:\\Workspace\\ideaworksapce\\wordcount\\src\\data\\output";
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        // 7. Submit the job
        // job.submit(); // submit only
        boolean res = job.waitForCompletion(true); // submit and also print progress/logs
        System.exit(res ? 0 : 1);
    }
}
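As a minimal illustration, assume the input directory above contains a single text file, say words.txt (the file name and its contents below are just an example). Note that MapReduce refuses to run if the output directory already exists, so delete it before re-running the job. Running WordCountDriver from the IDE should then leave a result like this:
rem Example input file: E:\Workspace\ideaworksapce\wordcount\src\data\words.txt
rem   hello world
rem   hello hadoop
rem Expected content of ...\data\output\part-r-00000 after the job finishes:
rem   hadoop  1
rem   hello   2
rem   world   1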
Setting up a pseudo-distributed environment on Windows
With the configuration above you can already write MapReduce programs, but you still cannot exercise the HDFS API, for example uploading and downloading files. The next step is to edit the HDFS configuration files and set up a pseudo-distributed cluster. (Note: a pseudo-distributed cluster is still a distributed cluster and can demonstrate the effect of distributed computation.)
- Modify the configuration files
Go to the Hadoop directory extracted earlier, open the etc/hadoop folder, and edit the following configuration files.
1. Edit hadoop-env.cmd and append the following lines at the end of the file:
set HADOOP_PREFIX=%HADOOP_HOME%
set HADOOP_CONF_DIR=%HADOOP_PREFIX%\etc\hadoop
set YARN_CONF_DIR=%HADOOP_CONF_DIR%
set PATH=%PATH%;%HADOOP_PREFIX%\bin
2. Edit core-site.xml:
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://0.0.0.0:9000</value>
    </property>
</configuration>
3. Edit hdfs-site.xml (adjust the dfs.name.dir and dfs.data.dir paths to match your own Hadoop directory):
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <property>
        <name>dfs.name.dir</name>
        <value>file:///D:/hadoop/hadoop-3.1.4/local/dfs/name</value>
    </property>
    <property>
        <name>dfs.data.dir</name>
        <value>file:///D:/hadoop/hadoop-3.1.4/local/dfs/data</value>
    </property>
</configuration>
4. Edit mapred-site.xml (replace %USERNAME% with your own user name):
<configuration>
    <property>
        <name>mapreduce.job.user.name</name>
        <value>%USERNAME%</value>
    </property>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>yarn.apps.stagingDir</name>
        <value>/user/%USERNAME%/staging</value>
    </property>
    <property>
        <name>mapreduce.jobtracker.address</name>
        <value>local</value>
    </property>
</configuration>
5. Edit yarn-site.xml:
<configuration>
    <property>
        <name>yarn.server.resourcemanager.address</name>
        <value>0.0.0.0:8020</value>
    </property>
    <property>
        <name>yarn.server.resourcemanager.application.expiry.interval</name>
        <value>60000</value>
    </property>
    <property>
        <name>yarn.server.nodemanager.address</name>
        <value>0.0.0.0:45454</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
        <name>yarn.server.nodemanager.remote-app-log-dir</name>
        <value>/app-logs</value>
    </property>
    <property>
        <name>yarn.nodemanager.log-dirs</name>
        <value>/dep/logs/userlogs</value>
    </property>
    <property>
        <name>yarn.server.mapreduce-appmanager.attempt-listener.bindAddress</name>
        <value>0.0.0.0</value>
    </property>
    <property>
        <name>yarn.server.mapreduce-appmanager.client-service.bindAddress</name>
        <value>0.0.0.0</value>
    </property>
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>-1</value>
    </property>
    <property>
        <name>yarn.application.classpath</name>
        <value>%HADOOP_CONF_DIR%,%HADOOP_COMMON_HOME%/share/hadoop/common/*,%HADOOP_COMMON_HOME%/share/hadoop/common/lib/*,%HADOOP_HDFS_HOME%/share/hadoop/hdfs/*,%HADOOP_HDFS_HOME%/share/hadoop/hdfs/lib/*,%HADOOP_MAPRED_HOME%/share/hadoop/mapreduce/*,%HADOOP_MAPRED_HOME%/share/hadoop/mapreduce/lib/*,%HADOOP_YARN_HOME%/share/hadoop/yarn/*,%HADOOP_YARN_HOME%/share/hadoop/yarn/lib/*</value>
    </property>
</configuration>
- Install winutils
Starting the cluster on Windows hits some known issues, so go to https://github.com/steveloughran/winutils, find the folder matching your Hadoop version, and copy the contents of that version's bin directory (winutils.exe, hadoop.dll, etc.) into the bin directory of your Hadoop installation.
- Configure hadoop.dll
Copy the hadoop.dll from the previous step into C:\Windows\System32 and restart the computer (see the command sketch below for both copy steps).
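A minimal sketch of the two copy steps from an administrator command prompt; the download location and version folder are hypothetical and should be replaced with wherever you put the winutils files:
rem Hypothetical download location; replace hadoop-x.y.z with the folder matching your Hadoop version
copy D:\Downloads\winutils\hadoop-x.y.z\bin\* %HADOOP_HOME%\bin\
copy %HADOOP_HOME%\bin\hadoop.dll C:\Windows\System32\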
- Initialize the environment variables
Press Win+R, type cmd to open a command prompt, and run:
%HADOOP_HOME%\etc\hadoop\hadoop-env.cmd
- Format the NameNode by running:
hadoop namenode -format
- Start the cluster by running %HADOOP_HOME%\sbin\start-all.cmd in the console.
- Check the daemons with jps; if the output looks roughly like the sketch below, everything is running normally.
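Roughly what jps should report for a pseudo-distributed cluster; the process IDs below are purely illustrative and will differ on your machine:
jps
rem Typical listing (one process per daemon, PIDs will vary):
rem   1234 NameNode
rem   2345 DataNode
rem   3456 ResourceManager
rem   4567 NodeManager
rem   5678 Jps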
- Verify in a browser
Open localhost:50070 in a browser (for Hadoop 3.x the NameNode web UI has moved to localhost:9870); if the page loads, Hadoop has started successfully.
- Write some code and try creating a directory on HDFS
Add a new HDFSClient.java file to the project above; the code is as follows:
package com.zys.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

/**
 * @author zhengyunshuo
 * @date 2021/1/4 - 10:47
 */
public class HDFSClient {
    public static void main(String[] args) throws IOException {
        // 1. Get an HDFS client object
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://127.0.0.1:9000");
        FileSystem fs = FileSystem.get(conf);
        // 2. Create a directory on HDFS
        fs.mkdirs(new Path("/zys/test"));
        // 3. Release resources
        fs.close();
        System.out.println("Compile Over");
    }
}
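To confirm the directory really was created, one quick check (besides browsing the file system in the web UI) is to list it from the command line:
rem Should list the newly created /zys/test directory
hadoop fs -ls /zys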
You should see that the directory was created successfully, which confirms everything is working; from here you can get on with development.