Running MapReduce Examples from Eclipse on Windows

This article works through three practical problems with Hadoop MapReduce: deduplicating two files, sorting numbers, and mining grandparent-grandchild relationships. Each example shows the data-processing flow and the full implementation.

Eclipse on Windows is already connected to the Hadoop cluster running on Linux.
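
Before submitting any of the jobs below, it is worth confirming that the Eclipse client on Windows can actually reach HDFS. The following connectivity check is not part of the original examples; it is a minimal sketch that reuses the NameNode address hard-coded in the job drivers (hdfs://192.168.29.3:9000) and simply lists the HDFS root.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class HdfsCheck
{
    public static void main(String []args)throws Exception
    {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://192.168.29.3:9000");//same NameNode address as the jobs below
        FileSystem fs = FileSystem.get(conf);
        for(FileStatus status : fs.listStatus(new Path("/")))//if this prints paths, the connection works
            System.out.println(status.getPath());
        fs.close();
    }
}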

一、Deduplicating two files

File 1:

20150101 x
20150102 y
20150103 x
20150104 y
20150105 z
20150106 x

File 2:

20150101 y
20150102 y
20150103 x
20150104 z
20150105 y

Expected output:

20150101 x  
20150101 y  
20150102 y  
20150103 x  
20150104 y  
20150104 z  
20150105 y  
20150105 z  
20150106 x

Approach: use the merging that reduce performs directly. Each input line is emitted as a key with an empty value, so duplicate lines collapse into a single key and are written out exactly once.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Merge
{
    public static class Map extends Mapper<Object, Text, Text, Text>
    {
        public void map(Object key, Text value, Context context)throws IOException, InterruptedException
        {
            context.write(value, new Text(""));
        }
    }
    public static class Reduce extends Reducer<Text, Text, Text, Text>
    {
        public void reduce(Text key, Iterable<Text>values, Context context)throws IOException, InterruptedException
        {
            context.write(key, new Text(""));
        }
    }
    public static void main(String []args)throws Exception
    {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://192.168.29.3:9000");
        String []otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
        if(otherArgs.length != 2)
        {
            System.err.print("No such files.(input output)");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "first example");
        job.setJarByClass(Merge.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);//output key type
        job.setOutputValueClass(Text.class);//output value type
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));//input path, supplied manually via the run configuration's Arguments
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//output path, supplied manually via the run configuration's Arguments
        System.exit(job.waitForCompletion(true)?0:1);

    }
}
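
A practical note: FileOutputFormat will not start a job if the output directory already exists, so re-running from Eclipse fails until the old directory is removed. The two paths themselves go into the Arguments of the Eclipse run configuration, as the comments above indicate. One optional way to handle stale output, sketched below rather than taken from the original code, is to delete it in main right before submitting the job; the snippet assumes the conf and otherArgs variables defined above and needs an extra import of org.apache.hadoop.fs.FileSystem.

        FileSystem fs = FileSystem.get(conf);
        Path outputPath = new Path(otherArgs[1]);
        if(fs.exists(outputPath))
            fs.delete(outputPath, true);//true = delete the directory recursively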

二、Sorting numbers

Input file:

200
9
1
30
12
18

Output file:

1 1
2 9
3 12
4 18
5 30
6 200

Approach: map output is sorted by key before it reaches reduce, but that ordering is only guaranteed within a single reducer. A custom Partitioner is therefore still needed: it sends each contiguous range of numbers to the same reducer, which keeps the overall output globally sorted, and the reducer then numbers the keys as it emits them.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Sort
{
    public static class Map extends Mapper<Object, Text, IntWritable, IntWritable>
    {
        public void map(Object key, Text value, Context context)throws IOException, InterruptedException
        {
            String tmp = value.toString();
            context.write(new IntWritable(Integer.parseInt(tmp)), new IntWritable(1));
        }
    }
    public static class Reduce extends Reducer<IntWritable,IntWritable,IntWritable,IntWritable>
    {
        private static IntWritable id = new IntWritable(1);
        public void reduce(IntWritable key, Iterable<IntWritable>values, Context context)throws IOException, InterruptedException
        {
            for(IntWritable value : values)//the input may contain duplicate values, so emit the key once per occurrence
            {
                context.write(id, key);
                id = new IntWritable(id.get()+1);
            }
        }
    }
    public static class Partition extends Partitioner<IntWritable, IntWritable>
    {
        public int getPartition(IntWritable key, IntWritable value, int num_part)
        {
            int maxnum = 65223;//assumed upper bound on the input values
            int bound = maxnum/num_part+1;
            int keynum = key.get();
            for(int i=0; i<num_part; ++i)
                if(i*bound<=keynum && (i+1)*bound>keynum)
                    return i;
            return num_part-1;//keys above maxnum fall into the last partition instead of failing the job
        }
    }
    public static void main(String []args)throws Exception
    {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://192.168.29.3:9000");
        String []otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
        if(otherArgs.length != 2)
        {
            System.err.print("no such file.(input output)");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "example 2");
        job.setJarByClass(Sort.class);
        job.setMapperClass(Map.class);
        job.setPartitionerClass(Partition.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true)?0:1);
    }
}
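
To see what the custom Partitioner does, the standalone sketch below (not part of the job) reproduces the same arithmetic: with bound = maxnum/num_part + 1, the loop in getPartition is equivalent to integer division of the key by bound, clamped to the last partition. The keys 200, 30000 and 65000 are made-up values for illustration; with the six sample numbers above and a single reducer, everything lands in partition 0 anyway.

public class PartitionDemo
{
    //same arithmetic as Sort.Partition.getPartition, written as a plain function
    static int partitionFor(int key, int numPartitions)
    {
        int maxnum = 65223;
        int bound = maxnum/numPartitions+1;
        return Math.min(key/bound, numPartitions-1);
    }
    public static void main(String []args)
    {
        int []keys = {200, 30000, 65000};//hypothetical keys, chosen only to show the ranges
        for(int key : keys)
            System.out.println(key+" -> partition "+partitionFor(key, 3));
        //prints: 200 -> partition 0, 30000 -> partition 1, 65000 -> partition 2
    }
}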

三、Mining grandparent-grandchild relationships

Input file (child-parent pairs):

child parent
Steven Lucy
Steven Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Frank
Jack Alice
Jack Jesse
David Alice
David Jesse
Philip David
Philip Alma
Mark David
Mark Alma

Output file (grandchild-grandparent pairs):

grand_child grand_parent
Mark    Jesse
Mark    Alice
Philip  Jesse
Philip  Alice
Jone    Jesse
Jone    Alice
Steven  Jesse
Steven  Alice
Steven  Frank
Steven  Mary
Jone    Frank
Jone    Mary

Approach: for every child-parent pair, the mapper emits two <key, value> records: <parent, 1+child> and <child, 2+parent>. During reduce, each key (a person) therefore collects all of that person's children (values prefixed with 1) and all of that person's parents (values prefixed with 2); the cross product of the two lists yields the grandchild-grandparent pairs. A worked trace for one key follows the code below.

import java.io.IOException;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Parent
{
    public static int time = 0;
    public static class Map extends Mapper<Object, Text, Text, Text>
    {
        public void map(Object key,Text value, Context context)throws IOException, InterruptedException
        {
            String []values = value.toString().split("\\s+");//split the line into child and parent
            if(values.length == 2 && !values[0].equals("child"))//skip the header line
            {
                context.write(new Text(values[1]), new Text("1"+values[0]));//key = parent, value = 1 + child
                context.write(new Text(values[0]), new Text("2"+values[1]));//key = child,  value = 2 + parent
            }
        }
    }
    public static class Reduce extends Reducer<Text, Text, Text, Text>
    {
        public void reduce(Text key, Iterable<Text>values, Context context)throws IOException, InterruptedException
        {
            if(time == 0)
            {
                context.write(new Text("grand_child"), new Text("grand_parent"));
                ++time;
            }
            String []parents = new String[20];
            String []childs = new String[20];
            int ps=0, cs=0;
            for(Text value : values)//iterate with the element type instead of a raw Iterator
            {
                String tmp = value.toString();
                if(tmp.charAt(0) == '1') childs[cs++] = tmp.substring(1);//a child of this person
                else parents[ps++] = tmp.substring(1);//a parent of this person
            }
            if(ps!=0 && cs!=0)
                for(int i=0; i<cs; ++i)
                    for(int j=0; j<ps; ++j)
                        context.write(new Text(childs[i]), new Text(parents[j]));
        }
    }
    public static void main(String []args)throws Exception
    {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://192.168.29.3:9000");
        String []otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
        if(otherArgs.length != 2)
        {
            System.err.print("no such file.input output");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "example 3");//物联网1班黄律棋
        job.setJarByClass(Parent.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job,  new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true)?0:1);
    }
}
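
As a sanity check on the join logic, the standalone sketch below (not part of the job) replays what Reduce receives for the key Lucy with the sample input above: the 1-prefixed values are Lucy's children, the 2-prefixed values are Lucy's parents, and their cross product yields four of the grandchild-grandparent pairs in the expected output.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class GrandparentTrace
{
    public static void main(String []args)
    {
        //values grouped under key "Lucy" by Parent.Map:
        //"1"+child comes from <parent, 1child>, "2"+parent comes from <child, 2parent>
        List<String> values = Arrays.asList("1Steven", "1Jone", "2Mary", "2Frank");
        List<String> childs = new ArrayList<String>();
        List<String> parents = new ArrayList<String>();
        for(String v : values)
        {
            if(v.charAt(0) == '1') childs.add(v.substring(1));
            else parents.add(v.substring(1));
        }
        for(String c : childs)//cross product = grandchild / grandparent pairs through Lucy
            for(String p : parents)
                System.out.println(c+"\t"+p);
        //prints: Steven Mary, Steven Frank, Jone Mary, Jone Frank
    }
}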