基于pagerank算法的运用Hbase的搜索引擎（1）——数据清洗篇

最新推荐文章于 2025-10-11 12:49:20 发布

原创最新推荐文章于 2025-10-11 12:49:20 发布 · 545 阅读

4 ·

CC 4.0 BY-SA版权

搜索引擎制作专栏收录该内容

5 篇文章

订阅专栏

数据清洗

数据来源

用 Nutch 爬取的三层页面信息。

主要信息的字段含义：

inlinks 入链(url:linktext)，列族名：il
outlinks 出链(url:linktext)，列族名：ol
baseUrl 用于将网页源码中的相对链接地址转为绝对地址，通常就是当前网页的地址；有重定向的情况下，是最终定向到的地址。列族名：f，列名：bas
text 合并了解析出来的所有文本字段(utf-8)，用于普通的检索。列族名：p，列名：c


import java.io.IOException;
import java.util.Arrays;
import java.util.Map.Entry;
import java.util.NavigableMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/**
 * MapReduce job that cleans crawled page data held in an HBase table and writes
 * the result to a second HBase table.
 *
 * <p>The source and target table names are read from the job configuration keys
 * {@code before_table} and {@code clean_table} (typically passed as
 * {@code -D before_table=... -D clean_table=...}).
 *
 * <p>The target table must exist before the job runs, with the column families
 * {@code page}, {@code il} and {@code ol}, e.g.
 * {@code create 'clean_webpage','page','il','ol'}:
 * <ul>
 *   <li>{@code page} holds the title ({@code t}), content ({@code c}), outlink
 *       count ({@code oln}) and inlink count ({@code iln});</li>
 *   <li>{@code il} holds the inlinks: qualifier = inlink URL, value = link text;</li>
 *   <li>{@code ol} holds the outlinks: qualifier = outlink URL, value = link text.</li>
 * </ul>
 */
public class CleanDataMR extends Configured implements Tool {
	/** ZooKeeper quorum used when the caller has not configured one. */
	private static final String DEFAULT_ZK_QUORUM = "hadoopPD:2181";

	/**
	 * Value of {@code f:st} that marks a row as usable. The original author's note
	 * states that status 2 means the page was crawled correctly.
	 */
	private static final int STATUS_OK = 2;

	// Column families / qualifiers of the source (crawler) table.
	private static final byte[] SRC_FAMILY_FETCH = Bytes.toBytes("f");
	private static final byte[] SRC_FAMILY_PARSE = Bytes.toBytes("p");
	private static final byte[] SRC_BASE_URL = Bytes.toBytes("bas");
	private static final byte[] SRC_STATUS = Bytes.toBytes("st");
	private static final byte[] SRC_CONTENT = Bytes.toBytes("c");
	private static final byte[] SRC_TITLE = Bytes.toBytes("t");

	// Column families shared by the source and the cleaned table.
	private static final byte[] FAMILY_INLINKS = Bytes.toBytes("il");
	private static final byte[] FAMILY_OUTLINKS = Bytes.toBytes("ol");

	// Column family of the cleaned table that stores the scalar page fields.
	private static final byte[] FAMILY_PAGE = Bytes.toBytes("page");

	// Field names used both as keys of the shuffled MapWritable and as qualifiers in "page".
	private static final String FIELD_CONTENT = "c";
	private static final String FIELD_TITLE = "t";
	private static final String FIELD_OUTLINK_COUNT = "oln";
	private static final String FIELD_INLINK_COUNT = "iln";
	private static final String FIELD_OUTLINKS = "ol_list";
	private static final String FIELD_INLINKS = "il_list";

	/** Builds the MapWritable key for a named field. */
	private static BytesWritable fieldKey(String name) {
		return new BytesWritable(Bytes.toBytes(name));
	}

	/**
	 * Command-line entry point; the process exit code reflects the job outcome.
	 *
	 * @param args generic Hadoop options followed by tool arguments
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {
		System.exit(ToolRunner.run(new CleanDataMR(), args));
	}

	/**
	 * Configures and runs the cleaning job.
	 *
	 * @param args unused; table names come from the configuration
	 * @return 0 if the job succeeded, 1 if it failed
	 * @throws IllegalArgumentException if {@code before_table} or {@code clean_table} is not set
	 * @throws Exception if job setup or submission fails
	 */
	@Override
	public int run(String[] args) throws Exception {
		Configuration conf = getConf();
		String sourceTable = conf.get("before_table");
		String targetTable = conf.get("clean_table");
		if (sourceTable == null || targetTable == null) {
			throw new IllegalArgumentException(
					"Both 'before_table' and 'clean_table' must be set, e.g. -D before_table=... -D clean_table=...");
		}

		// Job.getInstance copies the configuration, so every setting must be applied
		// BEFORE the job is created or the job never sees it. setIfUnset keeps an
		// explicit -D hbase.zookeeper.quorum=... from the command line in effect.
		conf.setIfUnset("hbase.zookeeper.quorum", DEFAULT_ZK_QUORUM);

		Job job = Job.getInstance(conf, "cleanData");
		job.setJarByClass(CleanDataMR.class);

		// A full-table MapReduce scan reads each block once; keeping those blocks
		// out of the region servers' block cache avoids evicting useful data.
		Scan scan = new Scan();
		scan.setCacheBlocks(false);

		TableMapReduceUtil.initTableMapperJob(sourceTable, scan, CDMapper.class, ImmutableBytesWritable.class, MapWritable.class, job);
		TableMapReduceUtil.initTableReducerJob(targetTable, CDReducer.class, job);
		return job.waitForCompletion(true) ? 0 : 1;
	}

	/**
	 * Reads one crawled row and, if its status is {@link #STATUS_OK}, emits the page's
	 * base URL as key and a {@link MapWritable} with title, content, link counts and
	 * link lists as value.
	 */
	public static class CDMapper extends TableMapper<ImmutableBytesWritable, MapWritable> {

		/**
		 * Wraps a column-family map (qualifier to value) in a {@link MapWritable}.
		 *
		 * @param m qualifier-to-value map of one column family; {@code null} is treated as empty
		 * @return a new MapWritable with {@link BytesWritable} keys and values
		 */
		public MapWritable getMap(NavigableMap<byte[], byte[]> m) {
			MapWritable res = new MapWritable();
			if (m == null) {
				return res;
			}
			for (Entry<byte[], byte[]> e : m.entrySet()) {
				res.put(new BytesWritable(e.getKey()), new BytesWritable(e.getValue()));
			}
			return res;
		}

		/**
		 * Emits the cleaned record for one source row.
		 *
		 * <p>Rows without a readable {@code f:st}, with a status other than
		 * {@link #STATUS_OK}, or without a non-empty {@code f:bas} are skipped;
		 * skipped malformed rows are counted in the {@code CleanData} counter group.
		 */
		@Override
		protected void map(ImmutableBytesWritable key, Result result, Context context) throws IOException, InterruptedException {
			// Crawl status f:st is stored as a 4-byte int; a missing or short cell cannot be decoded.
			byte[] st = result.getValue(SRC_FAMILY_FETCH, SRC_STATUS);
			if (st == null || st.length < Bytes.SIZEOF_INT) {
				context.getCounter("CleanData", "MISSING_STATUS").increment(1);
				return;
			}
			if (Bytes.toInt(st) != STATUS_OK) {
				return;
			}

			// The output row key is the absolute URL in f:bas (the source row key is not used).
			// An empty row key would be rejected by Put in the reducer, so skip such rows here.
			byte[] bas = result.getValue(SRC_FAMILY_FETCH, SRC_BASE_URL);
			if (bas == null || bas.length == 0) {
				context.getCounter("CleanData", "MISSING_BASE_URL").increment(1);
				return;
			}

			// Missing content/title are normalised to zero-length values.
			byte[] cnt = result.getValue(SRC_FAMILY_PARSE, SRC_CONTENT);
			if (cnt == null) {
				cnt = Bytes.toBytes("");
			}
			byte[] title = result.getValue(SRC_FAMILY_PARSE, SRC_TITLE);
			if (title == null) {
				title = Bytes.toBytes("");
			}

			// Link lists and their sizes, taken from the ol / il column families.
			MapWritable outlinks = getMap(result.getFamilyMap(FAMILY_OUTLINKS));
			MapWritable inlinks = getMap(result.getFamilyMap(FAMILY_INLINKS));
			int oln = outlinks.size();
			int iln = inlinks.size();

			// Pack every field into a single MapWritable so the reducer receives one value per page.
			MapWritable map = new MapWritable();
			map.put(fieldKey(FIELD_CONTENT), new BytesWritable(cnt));
			map.put(fieldKey(FIELD_TITLE), new BytesWritable(title));
			map.put(fieldKey(FIELD_OUTLINK_COUNT), new BytesWritable(Bytes.toBytes(oln)));
			map.put(fieldKey(FIELD_INLINK_COUNT), new BytesWritable(Bytes.toBytes(iln)));
			map.put(fieldKey(FIELD_OUTLINKS), outlinks);
			map.put(fieldKey(FIELD_INLINKS), inlinks);

			context.write(new ImmutableBytesWritable(bas), map);
		}
	}

	/**
	 * Turns the record emitted by {@link CDMapper} into a {@link Put} on the cleaned table,
	 * using the base URL as row key.
	 */
	public static class CDReducer extends TableReducer<ImmutableBytesWritable, MapWritable, NullWritable> {

		/**
		 * Returns exactly the valid bytes of a scalar field of the record.
		 *
		 * <p>{@link BytesWritable#getBytes()} exposes the backing buffer, which after
		 * deserialization is usually longer than the data and would add trailing zero
		 * bytes to the stored cell; {@code copyBytes()} returns only the valid range.
		 */
		private static byte[] fieldBytes(MapWritable record, String field) {
			return ((BytesWritable) record.get(fieldKey(field))).copyBytes();
		}

		/** Adds every (URL, link text) entry of {@code links} to {@code put} under {@code family}. */
		private static void addLinks(Put put, byte[] family, MapWritable links) {
			for (Entry<Writable, Writable> e : links.entrySet()) {
				BytesWritable url = (BytesWritable) e.getKey();
				BytesWritable text = (BytesWritable) e.getValue();
				put.addColumn(family, url.copyBytes(), text.copyBytes());
			}
		}

		/**
		 * Writes one cleaned row per base URL.
		 *
		 * <p>Only the first value is used: the original design assumes a URL occurs once.
		 * If several source rows share a base URL, the remaining values are ignored.
		 */
		@Override
		protected void reduce(ImmutableBytesWritable key, Iterable<MapWritable> values, Context context)
				throws IOException, InterruptedException {
			MapWritable map = values.iterator().next();

			Put put = new Put(key.get());

			// Scalar fields go to the "page" family under their field name.
			put.addColumn(FAMILY_PAGE, Bytes.toBytes(FIELD_TITLE), fieldBytes(map, FIELD_TITLE));
			put.addColumn(FAMILY_PAGE, Bytes.toBytes(FIELD_CONTENT), fieldBytes(map, FIELD_CONTENT));
			put.addColumn(FAMILY_PAGE, Bytes.toBytes(FIELD_OUTLINK_COUNT), fieldBytes(map, FIELD_OUTLINK_COUNT));
			put.addColumn(FAMILY_PAGE, Bytes.toBytes(FIELD_INLINK_COUNT), fieldBytes(map, FIELD_INLINK_COUNT));

			// Outlink list -> "ol" family, inlink list -> "il" family.
			addLinks(put, FAMILY_OUTLINKS, (MapWritable) map.get(fieldKey(FIELD_OUTLINKS)));
			addLinks(put, FAMILY_INLINKS, (MapWritable) map.get(fieldKey(FIELD_INLINKS)));

			// TableOutputFormat ignores the key; only the Put is written.
			context.write(NullWritable.get(), put);
		}
	}
}