MyMapper
package com.ny.mapreduce;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
//Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
/*
 * Do not use Java primitive types for the map input/output types;
 * use Hadoop's Writable types (Text, IntWritable, ...) instead.
 */
public class MyMapper extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    // key holds the byte offset of the line (Object -> LongWritable); value holds one line of text
    @Override
    public void map(Object key, Text value, Context context) throws IOException,
            InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
MyReducer
package com.ny.mapreduce;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// The reduce input is the map output: <KEYIN, VALUEIN, KEYOUT, VALUEOUT>
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Records with the same key form one group; reduce() is called once per group,
    // and the group's values are iterated inside the method to compute sum, count, max, min, etc.
    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values,
            Context context) throws IOException, InterruptedException {
        // Map output for the word "hello":
        //   hello 1
        //   hello 1
        //   hello 1
        //   hello 1
        // After grouping, reduce receives  key: hello   values: (1, 1, 1, 1)
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
WordCount
package com.ny.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
    /*
     * Write the client (driver) first.
     */
    public static void main(String[] args) throws Exception {
        // Get the configuration object (true = load the cluster's *-site.xml resources)
        Configuration conf = new Configuration(true);

        // In HDFS the client is abstracted as FileSystem; in MapReduce the client is abstracted as Job.
        // Passing conf in gives the job all of the relevant role/cluster configuration.
        Job job = Job.getInstance(conf);
        // Create a new Job
        // Job job = Job.getInstance();

        // Needed when the job is packaged as a jar: Hadoop locates the jar by this class
        job.setJarByClass(WordCount.class);

        // Specify various job-specific parameters
        // Give the job a name: myjob
        job.setJobName("myjob");
        // job.setInputPath(new Path("in"));
        // job.setOutputPath(new Path("out"));

        // Define the input path
        Path input = new Path("/user/root/wc.txt");
        FileInputFormat.addInputPath(job, input);

        // Define the output path
        Path output = new Path("/wc/output");
        // If the output directory already exists, delete it, otherwise the job fails
        if (output.getFileSystem(conf).exists(output)) {
            output.getFileSystem(conf).delete(output, true);
        }
        FileOutputFormat.setOutputPath(job, output);

        // Set the Mapper class for the map phase
        job.setMapperClass(MyMapper.class);
        // Type of the map output key
        job.setMapOutputKeyClass(Text.class);
        // Type of the map output value: IntWritable
        job.setMapOutputValueClass(IntWritable.class);
        // Set the Reducer class for the reduce phase
        job.setReducerClass(MyReducer.class);

        // Submit the job, then poll for progress until the job is complete
        job.waitForCompletion(true);
    }
}
Packaging the project jar
The JDK version on Windows was 1.8, while the JDK version on Linux was 1.7.
This produces an error (typically java.lang.UnsupportedClassVersionError): a class file compiled with a higher-version JDK is being run on a lower-version JVM.
So the JDK version used to build on Windows must be kept consistent with the JDK version on Linux (or the build must target the lower version).
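If it is unclear which JDK produced a given class file, the major version in the class file header tells you (51 = Java 7, 52 = Java 8). Below is a minimal diagnostic sketch, assuming the path to the .class file is passed as the first argument; the class name and argument are just placeholders, not part of the project.
import java.io.DataInputStream;
import java.io.FileInputStream;
public class ClassVersionCheck {
    public static void main(String[] args) throws Exception {
        // args[0] is the path to a .class file, e.g. WordCount.class (placeholder)
        try (DataInputStream in = new DataInputStream(new FileInputStream(args[0]))) {
            int magic = in.readInt();           // class file magic number, should be 0xCAFEBABE
            int minor = in.readUnsignedShort();
            int major = in.readUnsignedShort(); // 51 = Java 7, 52 = Java 8
            System.out.println("major=" + major + " minor=" + minor + " magic=0x" + Integer.toHexString(magic));
        }
    }
}
Once the versions match, the packaged jar can be submitted on the cluster with the hadoop jar command (for example hadoop jar wc.jar com.ny.mapreduce.WordCount, where the jar name is only an example).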