1. Running the WordCount example that ships with Hadoop
Detailed steps
step1:
Create a file named wordcount.txt in the /home directory with the following content:
[root@node01 home]# vi wordcount.txt
hello tom
hello rose
hello jerry
hello TBL
hello tom
hello kitty
hello rose
hello TBL
hello ZDP
hello ZDP
hello TBL
[root@node01 home]# ll
total 48932
drwxr-xr-x 3 root root 4096 Aug 26 16:08 hadoop
-rw-r--r-- 1 root root 116 Aug 28 02:07 wordcount.txt
step2:
Create the directory /wc/input/ on HDFS to hold wordcount.txt:
[root@node01 home]# hadoop fs -mkdir -p /wc/input/
19/08/28 02:11:22 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Upload the wordcount.txt file just created to /wc/input/ on HDFS:
[root@node01 home]# hadoop fs -copyFromLocal ./wordcount.txt /wc/input/
19/08/28 02:12:16 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
step3:
Change to the directory that contains the Hadoop example jars:
[root@node01 home]# cd /home/hadoop/apps/hadoop-2.8.0/share/hadoop/mapreduce
[root@node01 mapreduce]# ll
total 5088
-rw-r--r-- 1 502 dialout 562900 Mar 17 2017 hadoop-mapreduce-client-app-2.8.0.jar
-rw-r--r-- 1 502 dialout 782739 Mar 17 2017 hadoop-mapreduce-client-common-2.8.0.jar
-rw-r--r-- 1 502 dialout 1571179 Mar 17 2017 hadoop-mapreduce-client-core-2.8.0.jar
-rw-r--r-- 1 502 dialout 195000 Mar 17 2017 hadoop-mapreduce-client-hs-2.8.0.jar
-rw-r--r-- 1 502 dialout 31533 Mar 17 2017 hadoop-mapreduce-client-hs-plugins-2.8.0.jar
-rw-r--r-- 1 502 dialout 66999 Mar 17 2017 hadoop-mapreduce-client-jobclient-2.8.0.jar
-rw-r--r-- 1 502 dialout 1587158 Mar 17 2017 hadoop-mapreduce-client-jobclient-2.8.0-tests.jar
-rw-r--r-- 1 502 dialout 75495 Mar 17 2017 hadoop-mapreduce-client-shuffle-2.8.0.jar
-rw-r--r-- 1 502 dialout 301934 Mar 17 2017 hadoop-mapreduce-examples-2.8.0.jar
drwxr-xr-x 2 502 dialout 4096 Mar 17 2017 jdiff
drwxr-xr-x 2 502 dialout 4096 Mar 17 2017 lib
drwxr-xr-x 2 502 dialout 4096 Mar 17 2017 lib-examples
drwxr-xr-x 2 502 dialout 4096 Mar 17 2017 sources
step4:
Run the MapReduce WordCount example provided by Hadoop:
hadoop jar hadoop-mapreduce-examples-2.8.0.jar wordcount /wc/input/wordcount.txt /wc/output/
Explanation of the command:
hadoop jar: run a jar file through Hadoop
hadoop-mapreduce-examples-2.8.0.jar: the jar file that contains the examples
wordcount: the example class inside the jar to run
/wc/input/wordcount.txt: the first argument to wordcount, the input path on HDFS
/wc/output/: the second argument to wordcount, the output directory on HDFS
step5:
After wordcount finishes, inspect the output directory on HDFS; the final result is as follows:
[root@node01 mapreduce]# hadoop fs -cat /wc/output/part-r-00000
19/08/28 02:32:43 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
TBL 3
ZDP 2
hello 11
jerry 1
kitty 1
rose 2
tom 2
2. Writing your own WordCount
package Maps;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * (hello,1)(tom,1)(hello,1)(rose,1)......
 *
 * The map program reads the input line by line. The byte offset at which each line starts is the key
 * (type LongWritable) and the content of the line is the value (type Text).
 * After the map phase, the output key is Text and the output value is IntWritable.
 *
 * Invocation mechanism:
 * 1. Who calls it: the map phase
 * 2. How it is called: within the current MapReduce job, map() is called once for every line that is read
 */
public class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // split the line into words and emit a (word, 1) pair for each of them
        String line = value.toString();
        String[] words = line.split(" ");
        for (String w : words) {
            context.write(new Text(w), new IntWritable(1));
        }
    }
}
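If you want to sanity-check the mapper without touching the cluster, a small unit test along the following lines should work. This is a sketch and not part of the original post; it assumes the (retired but still available) Apache MRUnit library and JUnit are on the classpath, and the class name WordCountMapTest is made up here:
package Maps;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;
public class WordCountMapTest {
    @Test
    public void mapEmitsOnePairPerWord() throws Exception {
        // feed a single line to the mapper and assert on the (word, 1) pairs it emits
        MapDriver.newMapDriver(new WordCountMap())
                .withInput(new LongWritable(0), new Text("hello tom"))
                .withOutput(new Text("hello"), new IntWritable(1))
                .withOutput(new Text("tom"), new IntWritable(1))
                .runTest();
    }
}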
package Maps;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * (hello,1)(hello,1)(hello,1)................
 * (hello,(1,1,1,1........))
 *
 * Reduce invocation mechanism:
 * 1. Who calls it: the reduce phase
 * 2. How it is called: reduce() is called once for each group of records that share the same key
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // sum the counts for this word; each value is 1 when it comes straight from the mapper,
        // but summing v.get() also stays correct if a combiner pre-aggregates the values
        int count = 0;
        for (IntWritable v : values) {
            count += v.get();
        }
        context.write(key, new IntWritable(count));
    }
}
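The reducer can be checked the same way; again a hedged sketch assuming MRUnit and JUnit, with the made-up class name WordCountReduceTest:
package Maps;
import java.util.Arrays;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;
public class WordCountReduceTest {
    @Test
    public void reduceSumsAllCountsForOneKey() throws Exception {
        // one call to reduce() receives every value that shares the key "hello"
        ReduceDriver.newReduceDriver(new WordCountReduce())
                .withInput(new Text("hello"), Arrays.asList(new IntWritable(1), new IntWritable(1), new IntWritable(1)))
                .withOutput(new Text("hello"), new IntWritable(3))
                .runTest();
    }
}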
package Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
public class WordCountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("yarn.resourcemanager.hostname", "node01");
        conf.set("fs.defaultFS", "hdfs://node01:9000/");
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountDriver.class);
        // set the Mapper and Reducer classes used by this job
        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);
        // set the output key/value types of the map phase and of the final (reduce) output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // specify the component used to read the input; the source is a plain text file on HDFS
        job.setInputFormatClass(TextInputFormat.class);
        // specify the component used to write the output; TextOutputFormat writes plain text to HDFS
        job.setOutputFormatClass(TextOutputFormat.class);
        // take the input and output paths from the command-line arguments
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // submit the job; the client returns immediately without waiting
        job.submit();
        // core code: keep the client attached to the job running on YARN, receive and print
        // its progress, and wait for the result
        // passing true means "wait and print progress"
        // result: true means the job completed successfully; false means it failed
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
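Hard-coding fs.defaultFS and the ResourceManager host in the driver works, but when the jar is launched with hadoop jar those values normally come from the cluster's own configuration files. A common alternative, sketched here with the hypothetical class name WordCountTool and not part of the original post, is to let ToolRunner inject the configuration and any -D overrides; it also registers the reducer as a combiner, which is safe because the reducer sums its values:
package Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WordCountTool extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // getConf() already holds the settings picked up from core-site.xml/yarn-site.xml,
        // plus any -D overrides passed on the command line
        Job job = Job.getInstance(getConf(), "word count");
        job.setJarByClass(WordCountTool.class);
        job.setMapperClass(WordCountMap.class);
        job.setCombinerClass(WordCountReduce.class); // optional map-side pre-aggregation
        job.setReducerClass(WordCountReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        // ToolRunner parses the generic options (-D, -files, ...) before handing the rest to run()
        System.exit(ToolRunner.run(new Configuration(), new WordCountTool(), args));
    }
}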
Package the project into a jar
Upload the jar to the server with Xshell:
[root@node01 home]# ll
total 48932
drwxr-xr-x 3 root root 4096 Aug 26 16:08 hadoop
-rw-r--r-- 1 root root 50095009 Sep 6 2019 Map.jar
-rw-r--r-- 1 root root 116 Aug 28 02:07 wordcount.txt
[root@node01 home]# hadoop jar Map.jar /wc/input/wordcount.txt /wc/out1put/
[root@node01 home]# hadoop fs -cat /wc/out1put/part-r-00000
19/08/28 03:14:01 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
TBL 3
ZDP 2
hello 11
jerry 1
kitty 1
rose 2
tom 2
3. Running locally
package Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
//import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
//import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
public class WordCountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // for a local run the cluster addresses are not needed, so they stay commented out
        //conf.set("yarn.resourcemanager.hostname", "node01");
        //conf.set("fs.defaultFS", "hdfs://node01:9000/");
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountDriver.class);
        // set the Mapper and Reducer classes used by this job
        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);
        // set the output key/value types of the map phase and of the final (reduce) output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // the input/output format components default to plain text, so these calls are optional here
        //job.setInputFormatClass(TextInputFormat.class);
        //job.setOutputFormatClass(TextOutputFormat.class);
        // set the input and output paths on the local file system
        FileInputFormat.setInputPaths(job, new Path("E:\\ziliao\\Hadoop\\driver\\wc\\input\\"));
        FileOutputFormat.setOutputPath(job, new Path("E:\\ziliao\\Hadoop\\driver\\wc\\output"));
        // submit the job; the client returns immediately without waiting
        job.submit();
        // core code: keep the client attached, receive and print the job's progress, wait for the result
        // passing true means "wait and print progress"
        // result: true means the job completed successfully; false means it failed
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
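One practical note for local reruns: FileOutputFormat refuses to start a job if the output directory already exists. A hedged helper sketch (the class OutputCleaner is made up here and is not part of the original code) that deletes a stale output directory through the standard FileSystem API:
package Maps;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
// Hypothetical helper: remove a previous output directory so a rerun does not fail
// with "Output directory ... already exists".
public final class OutputCleaner {
    private OutputCleaner() {}
    public static void deleteIfExists(Configuration conf, String dir) throws IOException {
        Path output = new Path(dir);
        // with fs.defaultFS unset this resolves to the local file system
        FileSystem fs = output.getFileSystem(conf);
        if (fs.exists(output)) {
            fs.delete(output, true); // recursive delete
        }
    }
}
In the local driver above it could be called right after the Configuration is created, for example OutputCleaner.deleteIfExists(conf, "E:\\ziliao\\Hadoop\\driver\\wc\\output");.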