pagerank算法_一个PageRank算法的MapReduce实现案例

最新推荐文章于 2025-03-18 20:13:50 发布

weixin_39854369

最新推荐文章于 2025-03-18 20:13:50 发布

阅读量287

点赞数

文章标签： pagerank算法

测试数据如下：

A	B	D
B	C
C	A	B
D	B	C

注意：第一列与第二列之间是制表符't',第二列与第三列之间是空格符

需求：ABCD分别代表4个网页，计算其入链权重（PR）排名网页的顺序。

网页之间的关系图如下：

算法原理：

首先，先创建一个Node类来封装每行数据，代码如下：

package hadoop.mr;

import org.apache.commons.lang.StringUtils;

import java.io.IOException;
import java.util.Arrays;

public class Node {
    private double pageRank = 1.0;
    private String[] adjacentNodeNames;

    private static final char fieldSeparator = 't';

    public double getPageRank() {
        return pageRank;
    }

    public Node setPageRank(double pageRank) {
        this.pageRank = pageRank;
        return this;
    }

    public String[] getAdjacentNodeNames() {
        return adjacentNodeNames;
    }

    public Node setAdjacentNodeNames(String[] adjacentNodeNames) {
        this.adjacentNodeNames = adjacentNodeNames;
        return this;
    }

    public boolean containAdjacentNodes(){
        return adjacentNodeNames!=null&&adjacentNodeNames.length>0;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(pageRank);
        if (getAdjacentNodeNames()!=null){
            sb.append(fieldSeparator).append(StringUtils.join(getAdjacentNodeNames(),fieldSeparator));
        }
        return sb.toString();
    }

    public static Node fromMR(String value) throws IOException{
        //将value按制表符进行切割
        String[] parts = StringUtils.splitPreserveAllTokens(value,fieldSeparator);
        //value为空将报异常
        if (parts.length<1){
            throw new IOException("Expected 1 or more parts but received"+parts.length);
        }
        //将1或者后面的0.3这样数赋值pageRank并返回对象
        Node node = new Node().setPageRank(Double.valueOf(parts[0]));
        if(parts.length>1){
            //将B D做成一个数组放到对象中
            node.setAdjacentNodeNames(Arrays.copyOfRange(parts,1,parts.length));

        }
        return node;
    }

    public static Node fromMR(String v1,String v2) throws IOException{
        return fromMR(v1+fieldSeparator+v2);
    }

}

然后创建Job类：

package hadoop.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;


public class RunJob {
    public static enum MyCounter{
        my
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        //如果分布式运行，必须打成jar包
        //且client在集群外非hadoop.jar这种方式运行，client中必须配置jar的位置
        //下面这行是hadoop2.5后出现的，默认为false，如果想运行在window系统中，需要设置为true
        conf.set("mapreduce.app-submission.corss-platform","true");
        //这个配置，只属于切换分布式到本地单进程模拟运行的配置
        //这种方式不是分布式，所以不用打jar包。
        conf.set("mapreduce.","local");

        double d = 0.001;
        int i = 0;
        while (true){
            i++;
            try{
                conf.setInt("runCount",i);
                FileSystem fs = FileSystem.get(conf);
                Job job = Job.getInstance(conf);
                job.setJarByClass(RunJob.class);
                job.setJobName("pr"+i);
                job.setMapperClass(PageRankMapper.class);
                job.setReducerClass(PageRankReducer.class);
                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(Text.class);
                //这是帮忙做切割，按制表符，制表符前面的就是key，后面的是value
                job.setInputFormatClass(KeyValueTextInputFormat.class);

                //数据的输入分两种，第一次是原始数据，从input拿，后面都是从上一次输出拿，即上一次的输出是下一次的输入
                Path input = new Path("/data/pagerank/input");
                if (i>1){
                    input = new Path("/data/pagerank/output/pr"+(i-1));
                }
                FileInputFormat.addInputPath(job,input);

                //输出
                Path output = new Path("/data/pagerank/output/pr"+i);
                if (fs.exists(output)){
                    fs.delete(output,true);
                }
                FileOutputFormat.setOutputPath(job,output);

                boolean f = job.waitForCompletion(true);
                if (f){
                    System.out.println("success");
                    //数据处理成功后拿被my标识的值，实际就是pr的差值
                    long sum = job.getCounters().findCounter(MyCounter.my).getValue();
                    System.out.println(sum);
                    double avgd = sum/4000.0;
                    if (avgd<d){
                        break;
                    }
                }
            }catch (Exception e){
                e.printStackTrace();
            }
        }
    }

    private static  class  PageRankMapper extends Mapper<Text,Text,Text,Text>{
        @Override
        protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
            //从配置文件中取runCount的值，在客户端中已经配过，是i的值，这里如果取不到，就等于1返回
            int runCount = context.getConfiguration().getInt("runCount",1);

            //第一次A  B D
            //第二次A  0.3 B D
            String page = key.toString();//KEY就是A这个位置的值
            Node node = null;
            if (runCount==1){
                //如果是第一次，给一个1+'t'+B D
                node = Node.fromMR("1.0",value.toString());
            }else {
                node=Node.fromMR(value.toString());
            }
            //toString的结果是1.0+'t'+B+'t'+D
            context.write(new Text(page),new Text(node.toString()));
            //假如数组不为空且长度不为零时结果为真
            if(node.containAdjacentNodes()){
                double outValue = node.getPageRank()/node.getAdjacentNodeNames().length;//看有几个入链，平均分
                for (int i = 0;i<node.getAdjacentNodeNames().length;i++){
                    String outPage = node.getAdjacentNodeNames()[i];
                    context.write(new Text(outPage),new Text(outValue+""));//结果就是B 0.5 D 0.5
                }
            }
        }
    }

    private static class PageRankReducer extends Reducer<Text,Text,Text,Text>{
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            //进来的数据有两种
            double sum = 0.0;

            Node sourceNode = null;
            for (Text i : values) {
                Node node = Node.fromMR(i.toString());
                if (node.containAdjacentNodes()){
                    //第一种：对应关系的数据
                    sourceNode = node;
                }else {
                    //第二种：就是B 0.5的这种数据
                    sum = sum + node.getPageRank();
                }
            }
            double newPR = (0.15/4)+(0.85*sum);//套用公式(1-d)+d*到改页面的入链值和d为0.85的阻尼系数，谷歌给的值，n是页面数
            System.out.println("***********new pageRank value is"+newPR);

            double d = newPR - sourceNode.getPageRank();

            int j = (int)(d*1000.0);
            //相差的值可能是负数，这里给它取了绝对值
            j=Math.abs(j);
            System.out.println(j+"--------------------------");
            //将差值存放到配置文件中，因为整个过程在循环中，方法读取进行比较
            context.getCounter(MyCounter.my).increment(j);

            sourceNode.setPageRank(newPR);
            context.write(key,new Text(sourceNode.toString()));
        }
    }
}