/** @author LU ZHENGYU
 *  Updated on Sept. 9, 2019
 */
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.io.IntWritable.Comparator;
import org.apache.hadoop.io.WritableComparable;
import java.io.IOException;
import java.util.ArrayList;
public class MoviesCount {
    /********* First MapReduce job: for each identical (MovieID, Rating) pair, collect the UserIDs
     * Mapper: emits MovieID and Rating as the key and the UserID as the value
     * Reducer: enumerates every pair of UserIDs from the Iterable and emits each pair with value 1
     */
public static class First_Mapper extends Mapper<Object, Text, Text, IntWritable> {
@Override
public void map(Object key, Text value, Context context) throws NumberFormatException, IOException, InterruptedException {
            // Skip the CSV header line (the record whose byte-offset key is 0)
if (key.toString().equals("0")) {
return;
}
            // Each line arrives in value; split it on commas: userId,movieId,rating, e.g. 1,1,4
            String[] fields = value.toString().split(",");
            // Build the key from movieId and rating
            String MRkey = fields[1] + ", " + fields[2];
            context.write(new Text(MRkey), new IntWritable(Integer.parseInt(fields[0])));
}
}
public static class First_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Copy the UIDs from the Iterable into a list so that all pairs can be enumerated
ArrayList<Integer> arrayList = new ArrayList<Integer>();
for (IntWritable value : values) {
arrayList.add(value.get());
}
            for (int i = 0; i < arrayList.size(); i++) {
                for (int j = i + 1; j < arrayList.size(); j++) {
                    int a = arrayList.get(i);
                    int b = arrayList.get(j);
                    // Always put the larger UID first so that a given pair always maps to the same key
                    String UIDkey = Math.max(a, b) + "," + Math.min(a, b);
                    context.write(new Text(UIDkey), one);
                }
            }
}
}
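    /* Illustration only (hypothetical rows, not from the source data): for ratings.csv lines
     * "1,31,2.5" and "7,31,2.5", First_Mapper emits ("31, 2.5", 1) and ("31, 2.5", 7);
     * First_Reducer then sees key "31, 2.5" with values [1, 7] and writes the pair
     * "7,1" (larger UID first) with a count of 1.
     */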
    /********* Second MapReduce job: accumulate the counts for each identical (UID_A, UID_B) pair
     * Mapper: passes UID_A,UID_B <1> straight through to the reducer
     * Reducer: sums the counts for each UID pair
     */
public static class Second_Mapper extends Mapper<Object, Text, Text, IntWritable> {
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Note: the previous job's output is tab-separated, not space-separated
String[] array = value.toString().split("\t");
context.write(new Text(array[0]), new IntWritable(Integer.parseInt(array[1])));
}
}
public static class Second_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int Count = 0;
for (IntWritable value : values) {
Count += value.get();
}
context.write(key, new IntWritable(Count));
}
}
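    /* Illustration only (hypothetical UIDs): if job1 emitted the line "7,1<TAB>1" three times
     * (users 7 and 1 gave the same rating to three different movies), Second_Mapper re-emits
     * ("7,1", 1) for each line and Second_Reducer sums them, writing "7,1<TAB>3".
     */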
    /********* Third MapReduce job: sort the results
     * Mapper: swaps key and value, emitting <sum> as the key and UID_A,UID_B as the value
     * Reducer: relies on the sorted keys, swaps them back, and writes the result to the output file
     */
    public static class Third_Mapper extends Mapper<Object, Text, IntWritable, Text> {
        @Override
        public void map(Object key, Text value, Context context) throws NumberFormatException, IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            context.write(new IntWritable(Integer.parseInt(split[1])), new Text(split[0]));
}
}
    public static class Third_Reducer extends Reducer<IntWritable, Text, Text, IntWritable> {
        @Override
        public void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text text : values) {
                context.write(text, key);
}
}
}
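    /* Illustration only: for the job2 line "7,1<TAB>3", Third_Mapper emits (3, "7,1"); the shuffle
     * sorts these IntWritable keys (in descending order once DecreasingComparator below is set as
     * the sort comparator), and Third_Reducer swaps key and value back, writing "7,1<TAB>3" so the
     * final file is ordered by decreasing co-rating count.
     */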
    /**
     * Comparator for sorting the output in descending order;
     * refer to https://blog.youkuaiyun.com/Gamer_gyt/article/details/48025805
     */
private static class DecreasingComparator extends Comparator {
@SuppressWarnings("rawtypes")
public int compare(WritableComparable a, WritableComparable b) {
return -super.compare(a, b);
}
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
return -super.compare(b1, s1, l1, b2, s2, l2);
}
}
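    /* Note: IntWritable.Comparator sorts ascending by default, so negating its result gives a
     * descending sort. Both overloads are negated because the byte-level compare(byte[],...) is
     * the one the shuffle actually uses when sorting serialized keys.
     */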
public static void main(String[] args) throws IOException {
JobConf conf = new JobConf(MoviesCount.class);
//Refer to http://www.tuicool.com/articles/N7buuy
        // Configure the first job
Job job1 = new Job(conf, "join1");
job1.setJarByClass(MoviesCount.class);
        // Set the corresponding Mapper and Reducer
job1.setMapperClass(First_Mapper.class);
job1.setReducerClass(First_Reducer.class);
        // Key class of the map output
        job1.setMapOutputKeyClass(Text.class);
        // Value class of the map output
        job1.setMapOutputValueClass(IntWritable.class);
        // Key class of the reduce output
        job1.setOutputKeyClass(Text.class);
        // Value class of the reduce output
        job1.setOutputValueClass(IntWritable.class);
        // Wrap job1 in a ControlledJob
ControlledJob ctrl_job1 = new ControlledJob(conf);
ctrl_job1.setJob(job1);
        // Input and output paths for job1
FileInputFormat.addInputPath(job1, new Path(args[0]));
FileOutputFormat.setOutputPath(job1, new Path(args[1]));
        // Configure the second job
Job job2 = new Job(conf, "Join2");
job2.setJarByClass(MoviesCount.class);
job2.setMapperClass(Second_Mapper.class);
job2.setReducerClass(Second_Reducer.class);
        // Key class of the map output
        job2.setMapOutputKeyClass(Text.class);
        // Value class of the map output
        job2.setMapOutputValueClass(IntWritable.class);
        // Key class of the reduce output
        job2.setOutputKeyClass(Text.class);
        // Value class of the reduce output
        job2.setOutputValueClass(IntWritable.class);
        // Wrap job2 in a ControlledJob
ControlledJob ctrl_job2 = new ControlledJob(conf);
ctrl_job2.setJob(job2);
        // job2 starts only after job1 has completed
ctrl_job2.addDependingJob(ctrl_job1);
        // The input path is the previous job's output path, so args[1] must match the output path set above
FileInputFormat.addInputPath(job2, new Path(args[1]));
        // The output path takes a fresh argument: the output directory must not already exist,
        // so new Path(args[2]) is used because args[2] has not been used above; any unused path works
FileOutputFormat.setOutputPath(job2, new Path(args[2]));
        // Configure the third job
Job job3 = new Job(conf, "Join3");
job3.setJarByClass(MoviesCount.class);
job3.setMapperClass(Third_Mapper.class);
job3.setReducerClass(Third_Reducer.class);
        // Key class of the map output
        job3.setMapOutputKeyClass(IntWritable.class);
        // Value class of the map output
        job3.setMapOutputValueClass(Text.class);
        // Key class of the reduce output
        job3.setOutputKeyClass(Text.class);
        // Value class of the reduce output
        job3.setOutputValueClass(IntWritable.class);
job3.setSortComparatorClass(DecreasingComparator.class);
        // Wrap job3 in a ControlledJob
ControlledJob ctrl_job3 = new ControlledJob(conf);
ctrl_job3.setJob(job3);
        // job3 starts only after job2 has completed
ctrl_job3.addDependingJob(ctrl_job2);
FileInputFormat.addInputPath(job3, new Path(args[2]));
FileOutputFormat.setOutputPath(job3, new Path(args[3]));
        // The master control container that manages the three sub-jobs above
        // Note: import the new-API class org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl
        // The constructor argument is a group name; all jobs belong to one group
JobControl jobCtrl = new JobControl("ctrl");
        // Add the controlled jobs to the master JobControl
jobCtrl.addJob(ctrl_job1);
jobCtrl.addJob(ctrl_job2);
jobCtrl.addJob(ctrl_job3);
        // Start the JobControl in its own thread; this step must not be omitted
new Thread(jobCtrl).start();
        while (true) {
            if (jobCtrl.allFinished()) {
                // When all jobs have finished, print the list of successful jobs
                System.out.println(jobCtrl.getSuccessfulJobList());
                jobCtrl.stop();
                break;
            }
            // Poll with a short sleep to avoid busy-waiting while the jobs run
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
}
}
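/* Usage sketch (jar name and paths are placeholders, not from the source): the driver expects four
 * arguments, e.g.
 *   hadoop jar MoviesCount.jar MoviesCount /data/ratings.csv /out/pairs /out/counts /out/sorted
 * args[0] is the ratings CSV; args[1] to args[3] are output directories that must not already exist.
 */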
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<hadoopVersion>2.9.2</hadoopVersion>
</properties>
<dependencies>
<!-- Hadoop start -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoopVersion}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoopVersion}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoopVersion}</version>
</dependency>
<!-- Hadoop -->
<dependency>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
<version>1.7</version>
<scope>system</scope>
<systemPath>D:/java7/lib/tools.jar</systemPath>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
</dependencies>
# Configure logging for testing: optionally with log file
log4j.rootLogger=WARN, stdout
# log4j.rootLogger=WARN, stdout, logfile
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n