Eclipse on Windows is already connected to the Hadoop cluster running on Linux.
1. Deduplicating two files
File 1:
20150101 x
20150102 y
20150103 x
20150104 y
20150105 z
20150106 x
File 2:
20150101 y
20150102 y
20150103 x
20150104 z
20150105 y
Expected output:
20150101 x
20150101 y
20150102 y
20150103 x
20150104 y
20150104 z
20150105 y
20150105 z
20150106 x
Approach: rely directly on the merging that reduce already does. The mapper emits each whole line as a key with an empty value; after the shuffle, identical lines are grouped under one key, so the reducer only has to write each distinct key once. (A slightly leaner variant using NullWritable is sketched after the code.)
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Merge
{
    public static class Map extends Mapper<Object, Text, Text, Text>
    {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException
        {
            // Emit the whole line as the key; the empty value is just a placeholder.
            context.write(value, new Text(""));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text>
    {
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
        {
            // Identical lines arrive grouped under one key, so writing the key once removes duplicates.
            context.write(key, new Text(""));
        }
    }

    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://192.168.29.3:9000"); // legacy key for fs.defaultFS
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2)
        {
            System.err.println("Usage: Merge <input> <output>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "first example");
        job.setJarByClass(Merge.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);     // output key type
        job.setOutputValueClass(Text.class);   // output value type
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));   // input path, passed as a program argument
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); // output path, passed as a program argument
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
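The empty Text value carries no information. A slightly leaner variant of the same idea, sketched below (not part of the original program), emits NullWritable instead, so nothing is serialized on the value side; the driver would then also set NullWritable.class as the output value class.

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Sketch: the same deduplication idea with NullWritable placeholders.
public class MergeNull
{
    public static class Map extends Mapper<Object, Text, Text, NullWritable>
    {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException
        {
            context.write(value, NullWritable.get()); // the whole line is the key
        }
    }

    public static class Reduce extends Reducer<Text, NullWritable, Text, NullWritable>
    {
        public void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException
        {
            context.write(key, NullWritable.get()); // each distinct line is written once
        }
    }
}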
2. Sorting numbers
Input:
200
9
1
30
12
18
Output (rank followed by value):
1 1
2 9
3 12
4 18
5 30
6 200
Approach: keys are sorted automatically before they reach each reducer, but a custom Partitioner is still needed: with the default hash partitioner the numbers would be scattered across reducers arbitrarily, so instead each contiguous range of numbers is sent to the same reducer, which keeps the output files globally sorted when they are read in order. (A quick check of the bucket math is sketched after the code.)
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Sort
{
    public static class Map extends Mapper<Object, Text, IntWritable, IntWritable>
    {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException
        {
            // Each line holds one integer; emit it as the key so the framework sorts it.
            String tmp = value.toString();
            context.write(new IntWritable(Integer.parseInt(tmp)), new IntWritable(1));
        }
    }

    public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>
    {
        private static IntWritable id = new IntWritable(1); // running rank

        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
        {
            for (IntWritable value : values) // the input may contain duplicates; write one rank per occurrence
            {
                context.write(id, key);
                id = new IntWritable(id.get() + 1);
            }
        }
    }

    public static class Partition extends Partitioner<IntWritable, IntWritable>
    {
        // Assumes keys lie in [0, maxnum]; each reducer receives one contiguous range of that interval.
        public int getPartition(IntWritable key, IntWritable value, int num_part)
        {
            int maxnum = 65223;
            int bound = maxnum / num_part + 1;
            int keynum = key.get();
            for (int i = 0; i < num_part; ++i)
                if (i * bound <= keynum && (i + 1) * bound > keynum)
                    return i;
            return -1;
        }
    }

    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://192.168.29.3:9000"); // legacy key for fs.defaultFS
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2)
        {
            System.err.println("Usage: Sort <input> <output>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "example 2");
        job.setJarByClass(Sort.class);
        job.setMapperClass(Map.class);
        job.setPartitionerClass(Partition.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
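The driver above never calls job.setNumReduceTasks, so a single reducer runs by default and the custom partitioner changes nothing. It only matters once several reducers are configured: each one then gets a contiguous range of numbers, so the part files, read in order, stay globally sorted (note, though, that the rank counter id restarts at 1 in every reduce task, so the ranks are then only local to each part file). A small standalone check of the bucket math, assuming the hard-coded maxnum of 65223 and three reducers:

// Illustration only: which partition a key falls into with three reducers.
public class PartitionDemo
{
    public static void main(String[] args)
    {
        int maxnum = 65223;                  // same upper bound as in Sort.Partition
        int num_part = 3;                    // e.g. job.setNumReduceTasks(3)
        int bound = maxnum / num_part + 1;   // 21742
        int[] keys = {1, 200, 30000, 65000};
        for (int k : keys)
            System.out.println(k + " -> partition " + (k / bound));
        // prints: 1 -> 0, 200 -> 0, 30000 -> 1, 65000 -> 2
    }
}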
3. Mining grandchild-grandparent relationships
Input (child-parent pairs):
child parent
Steven Lucy
Steven Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Frank
Jack Alice
Jack Jesse
David Alice
David Jesse
Philip David
Philip Alma
Mark David
Mark Alma
Expected output (grandchild-grandparent pairs):
grand_child grand_parent
Mark Jesse
Mark Alice
Philip Jesse
Philip Alice
Jone Jesse
Jone Alice
Steven Jesse
Steven Alice
Steven Frank
Steven Mary
Jone Frank
Jone Mary
Approach: for every child-parent pair, the mapper emits two records, <parent, "1"+child> and <child, "2"+parent>. In reduce, each key (a person) then collects all of that person's children (values tagged 1) and all of that person's parents (values tagged 2), and pairing every child with every parent yields the grandchild-grandparent relations. For example, key Lucy receives {1Steven, 1Jone, 2Mary, 2Frank} and produces Steven-Mary, Steven-Frank, Jone-Mary, Jone-Frank. (A standalone trace of this reduce step is sketched after the code.)
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Parent
{
    public static int time = 0; // used once to emit the header line; assumes a single reduce task

    public static class Map extends Mapper<Object, Text, Text, Text>
    {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException
        {
            // Each line is "child parent", separated by a single space.
            String val = value.toString();
            int i = 0;
            while (val.charAt(i) != ' ') ++i;
            String[] values = {val.substring(0, i), val.substring(i + 1)};
            if (!values[0].equals("child")) // skip the header line
            {
                context.write(new Text(values[1]), new Text("1" + values[0])); // key = parent, value = "1" + child
                context.write(new Text(values[0]), new Text("2" + values[1])); // key = child,  value = "2" + parent
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text>
    {
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
        {
            if (time == 0)
            {
                context.write(new Text("grand_child"), new Text("grand_parent"));
                ++time;
            }
            List<String> childs = new ArrayList<String>();
            List<String> parents = new ArrayList<String>();
            for (Text value : values)
            {
                String tmp = value.toString();
                if (tmp.charAt(0) == '1') childs.add(tmp.substring(1)); // a child of this key
                else parents.add(tmp.substring(1));                     // a parent of this key
            }
            // Every child of the key paired with every parent of the key is a grandchild-grandparent relation.
            for (String child : childs)
                for (String parent : parents)
                    context.write(new Text(child), new Text(parent));
        }
    }

    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://192.168.29.3:9000"); // legacy key for fs.defaultFS
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2)
        {
            System.err.println("Usage: Parent <input> <output>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "example 3");
        job.setJarByClass(Parent.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
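To check the reduce-side join without running a cluster, here is a tiny standalone trace (an illustration only, using the sample data above) of what the reducer receives for key Lucy and the pairs it emits:

import java.util.*;

// Illustration only: one reduce call for key "Lucy".
public class JoinDemo
{
    public static void main(String[] args)
    {
        // Tagged values for key "Lucy": "1" + child, "2" + parent.
        List<String> values = Arrays.asList("1Steven", "1Jone", "2Mary", "2Frank");
        List<String> childs = new ArrayList<String>();
        List<String> parents = new ArrayList<String>();
        for (String v : values)
        {
            if (v.charAt(0) == '1') childs.add(v.substring(1));
            else parents.add(v.substring(1));
        }
        // Cross product: every grandchild with every grandparent.
        for (String c : childs)
            for (String p : parents)
                System.out.println(c + "\t" + p);
        // prints: Steven Mary, Steven Frank, Jone Mary, Jone Frank
    }
}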
This post walked through three practical problems solved with Hadoop MapReduce: deduplicating files, sorting numbers, and mining grandchild-grandparent relationships, showing the processing flow and the full code for each.