This project implements the following: write a web crawler that scrapes news (or blog) titles from Sohu (or CSDN), uploads each title together with its link to HDFS as a separate file (one file per title and link), tokenizes the title in each file, and then builds an inverted index over the tokens to provide basic search-engine functionality. If you are not familiar with building an inverted index, see my previous post:
Hadoop–倒排索引过程详解 (Hadoop: the inverted-index process explained)
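In a nutshell, an inverted index maps each term to the documents it occurs in, together with a count. Here is a tiny self-contained illustration in plain Java; the document ids and titles are made up, and it is only meant to show the shape of the data structure that the MapReduce job later in this post produces (with URLs as document ids):
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.TreeMap;
public class InvertedIndexDemo {
    public static void main(String[] args) {
        // two made-up "documents": id -> title
        Map<String, String> docs = new LinkedHashMap<String, String>();
        docs.put("doc1", "hadoop mapreduce tutorial");
        docs.put("doc2", "hadoop hdfs tutorial");
        // inverted index: term -> { docId -> term frequency }
        Map<String, Map<String, Integer>> index = new TreeMap<String, Map<String, Integer>>();
        for (Map.Entry<String, String> d : docs.entrySet()) {
            for (String term : d.getValue().split("\\s+")) {
                Map<String, Integer> postings = index.get(term);
                if (postings == null) {
                    postings = new TreeMap<String, Integer>();
                    index.put(term, postings);
                }
                Integer old = postings.get(d.getKey());
                postings.put(d.getKey(), old == null ? 1 : old + 1);
            }
        }
        // prints e.g.:  hadoop  {doc1=1, doc2=1}
        for (Map.Entry<String, Map<String, Integer>> e : index.entrySet()) {
            System.out.println(e.getKey() + "\t" + e.getValue());
        }
    }
}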
First, we need to write the web crawler ourselves.
When I first wrote the crawler I used htmlparser, stored every link it found in a queue, and did a vertical (depth) crawl. That was far too much work: after an hour it still had not finished, and it froze my machine. So I dropped the vertical crawl and now only crawl the news article links on the Sohu homepage.
Without further ado, here is the code.
First, the download utility class; explanations are in the comments.
package com.yc.spider;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Scanner;

/**
 * Download utility class
 * @author 汤高
 */
public class DownLoadTool {

    /**
     * Downloads the content of a page,
     * i.e. fetches the whole HTML document for the given address.
     *
     * @param addr the page URL
     * @return the page source as a string
     */
    public String downLoadUrl(final String addr) {
        StringBuffer sb = new StringBuffer();
        try {
            URL url;
            if (!addr.startsWith("http://")) {
                // prepend the protocol if it is missing
                url = new URL("http://" + addr);
            } else {
                System.out.println(addr);
                url = new URL(addr);
            }
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            con.setConnectTimeout(5000);
            con.connect();
            if (con.getResponseCode() == 200) {
                BufferedInputStream bis = new BufferedInputStream(con.getInputStream());
                Scanner sc = new Scanner(bis, "gbk");   // Sohu pages are GBK encoded
                while (sc.hasNextLine()) {
                    sb.append(sc.nextLine());
                }
                sc.close();
            }
            con.disconnect();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }
}
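A quick way to sanity-check the utility on its own is a small throwaway main class like the one below. It is not part of the project, just an illustration, and it needs network access to the Sohu homepage:
package com.yc.spider;

public class DownLoadToolDemo {
    public static void main(String[] args) {
        DownLoadTool dlt = new DownLoadTool();
        String html = dlt.downLoadUrl("http://www.sohu.com");
        // print only the first few hundred characters to confirm the page came back
        System.out.println(html.substring(0, Math.min(300, html.length())));
    }
}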
Next, the class that matches article links.
package com.yc.spider;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Article link extraction class
 *
 * @author 汤高
 */
public class ArticleDownLoad {

    /**
     * Regex that captures the href and the link text of an article's <a> tag.
     */
    static String ARTICLE_URL = "<a\\s+[^<>]*\\s+href=\"?(http[^<>\"]*)\"[^<>]*>([^<]*)</a>";

    /**
     * Extracts "url \t title" pairs from the page source.
     * (The method name is a leftover from an earlier image crawler.)
     */
    static Set<String> getImageLink(String html) {
        Set<String> result = new HashSet<String>();
        Pattern p = Pattern.compile(ARTICLE_URL, Pattern.CASE_INSENSITIVE);
        Matcher matcher = p.matcher(html);
        while (matcher.find()) {
            System.out.println("=======" + matcher.group(1) + "\t" + matcher.group(2));
            result.add(matcher.group(1) + "\t" + matcher.group(2));
        }
        return result;
    }
}
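To see what the regex extracts, it can be run against a small hand-written fragment; the link and title below are made up:
package com.yc.spider;

public class ArticleDownLoadDemo {
    public static void main(String[] args) {
        // a made-up fragment of page source with one article link
        String html = "<p><a class=\"news\" href=\"http://news.sohu.com/example.shtml\">示例新闻标题</a></p>";
        // prints the single entry: http://news.sohu.com/example.shtml<TAB>示例新闻标题
        for (String entry : ArticleDownLoad.getImageLink(html)) {
            System.out.println(entry);
        }
    }
}
Note that the pattern only matches <a> tags that carry at least one attribute before href and an absolute http:// link, which is enough for the Sohu homepage markup.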
Now the crawler class.
package com.yc.spider;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class Spider {

    private DownLoadTool dlt = new DownLoadTool();
    private ArticleDownLoad pdl = new ArticleDownLoad();

    public void crawling(String url) {
        try {
            Configuration conf = new Configuration();
            URI uri = new URI("hdfs://192.168.52.140:9000");
            FileSystem hdfs = FileSystem.get(uri, conf);
            String html = dlt.downLoadUrl(url);
            Set<String> allneed = pdl.getImageLink(html);
            for (String addr : allneed) {
                // create one HDFS file per "url \t title" pair, named by the current timestamp
                Path p1 = new Path("/myspider/" + System.currentTimeMillis());
                FSDataOutputStream dos = hdfs.create(p1);
                String a = addr + "\n";
                // write the pair into the file
                dos.write(a.getBytes());
                dos.close();
            }
            hdfs.close();
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } catch (URISyntaxException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Finally, the test class that kicks off the crawl.
package com.yc.spider;

public class Test {
    public static void main(String[] args) {
        Spider s = new Spider();
        s.crawling("http://www.sohu.com");
    }
}
Here is the result:

The crawled content has been uploaded to HDFS.
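Each file under /myspider/ holds a single "URL<TAB>title" line written by the crawler. Viewing one of them with the standard HDFS shell looks roughly like this (the timestamp file name, URL, and title are made up):
hdfs dfs -cat /myspider/1463123456789
http://news.sohu.com/example.shtml	示例新闻标题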
Next comes tokenization and building the inverted index.
Here are the packages used for tokenization.
For tokenization I used Lucene 5.5.1.
For Chinese word segmentation I used IKAnalyzer 5.0.1, which is not compatible with Lucene 5.5.1 out of the box, so I wrote a small compatibility layer and packaged it myself as tanggao-IkAnalyzer.jar. You could also use the latest IKAnalyzer 2012 release, which should be compatible with Lucene 5.5.1. A rough sketch of such a compatibility adapter follows.
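The jar is essentially a thin adapter that gives IK Analyzer a Lucene 5.x-style Analyzer/Tokenizer (Lucene 5 dropped the Reader argument from createComponents, which is what older IK releases trip over). The sketch below is an approximation written against IK's public IKSegmenter/Lexeme API, not necessarily identical to what is inside tanggao-IkAnalyzer.jar; the IKTokenizer5x class name and the end()/final-offset bookkeeping are omitted details of my own choosing:
package com.tg.Ik;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKAnalyzer5x extends Analyzer {

    private final boolean useSmart;

    public IKAnalyzer5x(boolean useSmart) {
        this.useSmart = useSmart;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Lucene 5.x signature: no Reader parameter
        return new TokenStreamComponents(new IKTokenizer5x(useSmart));
    }
}

class IKTokenizer5x extends Tokenizer {

    private final IKSegmenter ik;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    IKTokenizer5x(boolean useSmart) {
        // "input" is the protected Reader field inherited from Tokenizer
        this.ik = new IKSegmenter(input, useSmart);
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        Lexeme lexeme = ik.next();
        if (lexeme == null) {
            return false;
        }
        termAtt.append(lexeme.getLexemeText());
        offsetAtt.setOffset(lexeme.getBeginPosition(), lexeme.getEndPosition());
        return true;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        // rebind IK to the Reader that Lucene has just made current
        ik.reset(input);
    }
}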
Here is the MapReduce code.
package com.tg.hadoop;

import java.io.IOException;
import java.io.StringReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.tg.Ik.IKAnalyzer5x;

public class AriticleInvertedIndex {

    public static class ArticleInvertedIndexMapper extends Mapper<Text, Text, Text, Text> {

        private Text keyInfo = new Text();
        private Text valueInfo = new Text();

        @Override
        protected void map(Text key, Text value, Mapper<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // KeyValueTextInputFormat splits each line on the tab: key = article URL, value = title.
            // Tokenize the title with IK.
            Analyzer analyzer = new IKAnalyzer5x(true);
            TokenStream ts = analyzer.tokenStream("field", new StringReader(value.toString().trim()));
            // CharTermAttribute holds the text of the current token
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            try {
                ts.reset();
                while (ts.incrementToken()) {
                    System.out.println(termAtt.toString());
                    // intermediate key = "term:URL", value = "1"
                    keyInfo.set(termAtt.toString() + ":" + key);
                    valueInfo.set("1");
                    System.out.println("key" + keyInfo);
                    System.out.println("value" + valueInfo);
                    context.write(keyInfo, valueInfo);
                }
                ts.end();
            } finally {
                ts.close();
                analyzer.close();
            }
        }
    }

    public static class ArticleInvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {

        private Text info = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // sum the term frequency of this term within one document
            int sum = 0;
            for (Text value : values) {
                sum += Integer.parseInt(value.toString());
            }
            int splitIndex = key.toString().indexOf(":");
            // new value: "URL:frequency"
            info.set(key.toString().substring(splitIndex + 1) + ":" + sum);
            // new key: the bare term
            key.set(key.toString().substring(0, splitIndex));
            context.write(key, info);
            System.out.println("key" + key);
            System.out.println("value" + info);
        }
    }

    public static class ArticleInvertedIndexReducer extends Reducer<Text, Text, Text, Text> {

        private Text result = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // build the posting list: "URL1:freq1;URL2:freq2;..."
            StringBuilder fileList = new StringBuilder();
            for (Text value : values) {
                fileList.append(value.toString()).append(";");
            }
            result.set(fileList.toString());
            context.write(key, result);
        }
    }

    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf, "InvertedIndex");
            job.setJarByClass(AriticleInvertedIndex.class);

            // each input line is "URL \t title", so KeyValueTextInputFormat fits
            job.setInputFormatClass(KeyValueTextInputFormat.class);

            // the Mapper turns each input <key,value> pair into intermediate "term:URL" -> "1" pairs
            job.setMapperClass(ArticleInvertedIndexMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            job.setCombinerClass(ArticleInvertedIndexCombiner.class);
            job.setReducerClass(ArticleInvertedIndexReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            FileInputFormat.addInputPath(job, new Path("hdfs://192.168.52.140:9000/myspider/"));
            FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.52.140:9000/myspider_outindex" + System.currentTimeMillis() + "/"));

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IllegalStateException e) {
            e.printStackTrace();
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
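Each line of the job's output is a term followed by its posting list, i.e. term<TAB>URL1:freq1;URL2:freq2;... For instance, with a made-up term and made-up URLs, one output line would look like:
南海	http://news.sohu.com/a.shtml:1;http://news.sohu.com/b.shtml:1;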
Result:

Please credit the source when reposting: http://blog.youkuaiyun.com/tanggao1314/article/details/51382382