现有若干数据文件,每个文件对应某一年的数据,记录了该年每个名字对应的性别以及使用该名字的人数。
每个文件内数据格式如下(取其中的一部分)
Mary,F,7065
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746
Margaret,F,1578
Ida,F,1472
Alice,F,1414
...
Claud,M,90
Roscoe,M,90
Sylvester,M,89
Earnest,M,88
Hiram,M,88
Otis,M,88
...
要求:统计每一年 F(女)、M(男)各自对应的总人数
格式如下:
1880 F:90993 M:110491
基于本地仿真(Hadoop Local 模式)的实现如下:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
/**
 * Local-mode MapReduce job: for each yearly name file (year encoded in the
 * file name, e.g. "yob1880.txt"), sums the counts per sex and emits one line
 * per year in the form {@code <year>\tF:<femaleTotal>\tM:<maleTotal>}.
 */
public class TestPeopleCount {
    public static void main(String[] args) throws Exception {
        // 1. Create the job.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "people-count-by-sex");
        // 2. Input/output formats decide how records are read and written.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // 3. Input directory (all yearly files) and result directory.
        Path src = new Path("file:///E:/hadoop_test/names");
        Path res = new Path("file:///E:/hadoop_test/res");
        // Hadoop refuses to write into an existing output directory, so
        // remove a leftover result directory from a previous run.
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(res)) {
            fileSystem.delete(res, true);
        }
        TextInputFormat.addInputPath(job, src);
        TextOutputFormat.setOutputPath(job, res);
        // 4. Processing logic.
        job.setMapperClass(PeopleMapper.class);
        job.setReducerClass(PeopleReducer.class);
        // 5. Mapper and final output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // 6. Submit and propagate success/failure as the process exit code
        //    (the original discarded waitForCompletion's result).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Maps each CSV record "name,sex,count" to (year, "sex,count").
     * The year comes from the digits of the split's file name.
     */
    static class PeopleMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused output objects — standard Hadoop idiom to avoid allocating
        // two Text instances per input record.
        private final Text yearKey = new Text();
        private final Text sexAndCount = new Text();

        @Override
        protected void setup(Context context) {
            // Each map task processes a single split of a single file, so the
            // year is constant for the task: extract it once here instead of
            // re-parsing the file name for every record.
            String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
            StringBuilder digits = new StringBuilder();
            for (char c : fileName.toCharArray()) {
                if (c >= '0' && c <= '9') {
                    digits.append(c);
                }
            }
            yearKey.set(digits.toString());
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString().trim();
            if (line.isEmpty()) {
                return; // ignore blank lines instead of throwing
            }
            String[] tokens = line.split(",");
            if (tokens.length < 3) {
                // Malformed record: the original indexed tokens[1]/tokens[2]
                // unconditionally and an ArrayIndexOutOfBoundsException here
                // would fail the whole job. Skip it instead.
                return;
            }
            // key = year, value = "sex,count"
            sexAndCount.set(tokens[1] + "," + tokens[2]);
            context.write(yearKey, sexAndCount);
        }
    }

    /**
     * Sums the per-sex counts for one year and emits "F:<total>\tM:<total>".
     * Any record whose sex field is not "F" is counted as male, matching the
     * binary F/M data format.
     */
    static class PeopleReducer extends Reducer<Text, Text, Text, Text> {
        // Reused output value object.
        private final Text result = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // long accumulators: yearly totals can approach int range when
            // many files are aggregated; widening is free here.
            long males = 0;
            long females = 0;
            for (Text value : values) {
                String[] sexAndCount = value.toString().split(",");
                // parseInt (primitive) instead of valueOf (boxed Integer).
                long count = Integer.parseInt(sexAndCount[1].trim());
                if (sexAndCount[0].equals("F")) {
                    females += count;
                } else {
                    males += count;
                }
            }
            result.set("F:" + females + "\t" + "M:" + males);
            context.write(key, result);
        }
    }
}
运行结果
1880 F:90993 M:110491
1881 F:91953 M:100743
1882 F:107847 M:113686
1883 F:112319 M:104627
1884 F:129020 M:114442
1885 F:133055 M:107799
1886 F:144533 M:110784
1887 F:145981 M:101413
1888 F:178622 M:120851
1889 F:178366 M:110580
1890 F:190376 M:111025
1891 F:185481 M:101191
1892 F:212339 M:122036
1893 F:212905 M:112318
1894 F:222921 M:115769
1895 F:233627 M:117395
1896 F:237918 M:119567
1897 F:234200 M:112757
1898 F:258768 M:122690
1899 F:233023 M:106210
1900 F:299800 M:150483
1901 F:239345 M:106469
1902 F:264076 M:122660
1903 F:261971 M:119233
1904 F:275364 M:128125
...