A routing-aware MapReduce indexing program for SolrCloud

Summary: building an index on SolrCloud with a MapReduce program is noticeably faster than building it locally, although CloudSolrClient itself is also very efficient at index building. The key factor turns out to be the analyzer configuration: different granularity settings lead to order-of-magnitude differences in indexing speed. Suggestions for optimizing the program are welcome.


A few days ago I ran into a problem uploading a large volume of data: indexing was consistently slow. I saw articles online that implemented a routing-aware MapReduce indexing program, so I wrote one myself, shown below:
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.SystemDefaultHttpClient;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.DocRouter.Range;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.Hash;
import org.json.JSONObject;

// solrBean (a custom Writable) and MD5 (an MD5 helper) come from my own project;
// a sketch of both is given after the listing.
public class MapReduceWithRoute extends Configured implements Tool {

	/**
	 * 
	 * @param args
	 * @throws Exception
	 */
	public static void main(String[] args) throws Exception {

		long start = System.currentTimeMillis();

		System.out.println("hello");

		ToolRunner.run(new MapReduceWithRoute(), args);
		long end = System.currentTimeMillis();
		System.out.println("time is:" + ((end - start)));
	}

	public int run(String args[]) throws Exception {

		Job job;

		try {
			Configuration conf = getConf();

			if (conf == null)
				conf = new Configuration();

			int numReducerTasks = 3;// default

			try {
				numReducerTasks = Integer.parseInt(args[2]);
			} catch (Exception e) {
				System.out.println("Exception occurred getting reducers " + e.getMessage());
				e.printStackTrace();
			}

			// conf.set("dfs.replication", "2");
			// conf.set("mapred.map.tasks.speculative.execution", "false");
			// conf.set("mapred.reduce.tasks.speculative.execution", "false");
			// conf.set("mapreduce.job.ubertask.enable", "true");

			job = Job.getInstance(conf, "mapreducetest");

			job.setJarByClass(MapReduceWithRoute.class);
			job.setMapperClass(TestMapper.class);
			job.setReducerClass(TestReducer.class);

			job.setMapOutputKeyClass(Text.class);
			job.setMapOutputValueClass(solrBean.class);
			job.setNumReduceTasks(numReducerTasks);

			// job.setPartitionerClass(TestPartition.class);

			job.setOutputKeyClass(NullWritable.class);
			job.setOutputValueClass(Text.class);

			job.setInputFormatClass(TextInputFormat.class);

			FileInputFormat.setInputPaths(job, args[0]);
			FileOutputFormat.setOutputPath(job, new Path(args[1]));

			job.waitForCompletion(true);

		} catch (Exception e) {
			e.printStackTrace();
		}

		return 0;
	}

	/**
	 * map
	 * 
	 * @author Administrator
	 * 
	 */
	public static class TestMapper extends Mapper<LongWritable, Text, Text, solrBean> {

		String ZK_HOST = "10.254.9.65:2181,10.254.9.66:2181,10.254.9.67:2181";
		String APP_ID = "e2e6f6d8d94848cdaa26d9560fc3e791";
		private static final Logger LOG = Logger.getLogger(TestMapper.class);
		DocCollection docCollection = null;
		CloudSolrClient client = null;

		@Override
		protected void setup(Context context) throws IOException, InterruptedException {
			HttpClient httpClient = new DefaultHttpClient();
			client = new CloudSolrClient(ZK_HOST, httpClient);
			client.setDefaultCollection(APP_ID);
			client.connect();
			LOG.info("size:" + client.getZkStateReader().getClusterState().getCollections().size());
			docCollection = client.getZkStateReader().getClusterState().getCollection(APP_ID);
			LOG.info(docCollection.getActiveSlices().size());
		}

		@Override
		public void map(LongWritable key, Text columns, Context context) throws IOException, InterruptedException {

			solrBean lineInfo = parseLog(columns);

			if (lineInfo != null) {

				LOG.info("line info:" + lineInfo.getContent());
				String id = lineInfo.getId().toString();
				int sliceHash = sliceHash(id);
				LOG.info("name1:" + docCollection.getName());
				Slice slice = hashToSlice(sliceHash, docCollection);
				String shardName = slice.getName();// shard1,shard2 ...
				shardName = shardName.substring(5);
				LOG.info("map out:" + shardName + " + " + lineInfo);

				// The shard number alone is used as the map output key, so all documents
				// for one shard go to the same reducer. (Spreading the key, e.g. using
				// 10 * shardId + a random 0-9 offset, would raise reducer parallelism,
				// but the reducer below would then need to map the key back to a shard.)
				int shardId = Integer.parseInt(shardName);

				context.write(new Text(shardId + ""), lineInfo);
			}

		}

		private int sliceHash(String id) {
			return Hash.murmurhash3_x86_32(id, 0, id.length(), 0);
		}

		// copy from org.apache.solr.common.cloud.HashBasedRouter
		private Slice hashToSlice(int hash, DocCollection collection) {
			LOG.info(collection.getName());
			for (Slice slice : collection.getSlices()) {
				Range range = slice.getRange();
				if (range != null && range.includes(hash))
					return slice;
			}
			throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
					"No slice servicing hash code " + Integer.toHexString(hash) + " in " + collection);
		}

		private solrBean parseLog(Text line) {
			solrBean log = null;

			if ((line.toString() != null) && (!line.toString().isEmpty())) {

				JSONObject jsonObject = new JSONObject(line.toString());

				log = new solrBean();

				log.setContent(new Text(jsonObject.get("content").toString()));

				log.setPubtime(new Text(jsonObject.get("pubtime").toString()));
				log.setSource(new Text(jsonObject.get("source").toString()));
				log.setTitle(new Text(jsonObject.get("title").toString()));
				String md5 = MD5.getMD5(jsonObject.get("content").toString() + jsonObject.get("pubtime").toString()
						+ jsonObject.get("source").toString() + jsonObject.get("title").toString());
				log.setId(new Text(md5));

			}
			return log;
		}

	}

	/**
	 * reduce
	 * 
	 * @author Administrator
	 * 
	 */
	public static class TestReducer extends Reducer<Text, solrBean, Text, Text> {

		private static final Logger LOG = Logger.getLogger(TestReducer.class);

		String ZK_HOST = "10.254.9.65:2181,10.254.9.66:2181,10.254.9.67:2181";

		String APP_ID = "e2e6f6d8d94848cdaa26d9560fc3e791";

		CloudSolrClient client = null;
		HttpClient httpClient1 = null;
		ZkStateReader reader = null;
		HashMap<Integer, HttpSolrClient> solrServers = null;

		@Override
		protected void setup(Context context) throws IOException, InterruptedException {

			httpClient1 = new DefaultHttpClient();
			client = new CloudSolrClient(ZK_HOST, httpClient1);
			client.setDefaultCollection(APP_ID);
			client.connect();
			reader = client.getZkStateReader();

			solrServers = new HashMap<Integer, HttpSolrClient>();
			try {

				// NOTE: the shard count (3) is hardcoded here and must match the
				// collection's actual number of shards.
				for (int i = 1; i <= 3; i++) {
					SystemDefaultHttpClient httpClient2 = new SystemDefaultHttpClient();
					String url = reader.getLeaderUrl(APP_ID, "shard" + i, 3000);
					HttpSolrClient solrClient = new HttpSolrClient(url, httpClient2);
					solrServers.put(i, solrClient);
				}

			} catch (Exception e) {
				LOG.info("add solrServers error:" + e.getMessage());
			}
		}

		@Override
		public void reduce(Text key, Iterable<solrBean> values, Context context)
				throws IOException, InterruptedException {

			long start = System.currentTimeMillis();

			String partition = key.toString();
			LOG.info("key:" + partition);

			
			// 根据partition来启动不同的solrServer
			int shardid = Integer.parseInt(partition);
			
			List<SolrInputDocument> doclist = new ArrayList<SolrInputDocument>();

			try {

				HttpSolrClient solrServer = solrServers.get(shardid);

				long end = System.currentTimeMillis();

				LOG.info("end:" + (end - start));

				int i = 0;

				for (solrBean value : values) {

					SolrInputDocument doc = new SolrInputDocument();
					// LOG.info("id hashcode1:" + doc.hashCode());
					doc.setField("id", value.getId().toString());
					doc.setField("content", value.getContent().toString());

					doc.setField("pubtime", value.getPubtime().toString());
					doc.setField("title", value.getTitle().toString());
					doc.setField("source", value.getSource().toString());

					doclist.add(doc);
					i++;
				}

				LOG.info("doclist size:" + doclist.size());

				solrServer.add(doclist);
				solrServer.commit();

				long end1 = System.currentTimeMillis();
				LOG.info("end1:" + (end1 - end));

				context.write(key, new Text(i + ""));
			} catch (Exception e) {
				e.printStackTrace();
				LOG.info("get solrserver exception:" + e.getMessage());
			}

		}

	}

}
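
For completeness: the listing above depends on two small helper classes from my project that are not shown, solrBean (a custom Writable that carries one document from map to reduce) and MD5 (which returns a hex MD5 digest, used here as the document id). The sketch below is only an approximation of what they need to provide, matching the getters and setters used above; the actual classes may differ.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.security.MessageDigest;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

// Map output value: must implement Writable so Hadoop can serialize it
// between the map and reduce phases.
public class solrBean implements Writable {

	private Text id = new Text();
	private Text content = new Text();
	private Text pubtime = new Text();
	private Text source = new Text();
	private Text title = new Text();

	public void write(DataOutput out) throws IOException {
		id.write(out);
		content.write(out);
		pubtime.write(out);
		source.write(out);
		title.write(out);
	}

	public void readFields(DataInput in) throws IOException {
		id.readFields(in);
		content.readFields(in);
		pubtime.readFields(in);
		source.readFields(in);
		title.readFields(in);
	}

	public Text getId() { return id; }
	public void setId(Text id) { this.id = id; }
	public Text getContent() { return content; }
	public void setContent(Text content) { this.content = content; }
	public Text getPubtime() { return pubtime; }
	public void setPubtime(Text pubtime) { this.pubtime = pubtime; }
	public Text getSource() { return source; }
	public void setSource(Text source) { this.source = source; }
	public Text getTitle() { return title; }
	public void setTitle(Text title) { this.title = title; }
}

// MD5 helper: hex MD5 digest of a string, used to build a stable document id.
class MD5 {
	public static String getMD5(String input) {
		try {
			MessageDigest md = MessageDigest.getInstance("MD5");
			byte[] digest = md.digest(input.getBytes("UTF-8"));
			StringBuilder sb = new StringBuilder();
			for (byte b : digest) {
				sb.append(String.format("%02x", b));
			}
			return sb.toString();
		} catch (Exception e) {
			throw new RuntimeException(e);
		}
	}
}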


Testing confirmed that the MapReduce program is indeed faster than building the index locally, but the experiments also showed that CloudSolrClient used inside MapReduce builds the index very quickly too. The biggest factor affecting indexing speed is the analyzer's granularity setting, which produces order-of-magnitude differences.
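
For comparison, the CloudSolrClient path needs no manual routing at all: CloudSolrClient reads the cluster state from ZooKeeper and sends each document to its shard leader by itself. A minimal sketch of that approach is below, using the same ZooKeeper hosts and collection id as the MapReduce program; the CloudClientIndexer class name and the batch size of 1000 are placeholders of mine, not code from the test.

import java.util.ArrayList;
import java.util.List;

import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.common.SolrInputDocument;

public class CloudClientIndexer {

	// Batch indexing through CloudSolrClient, which routes each document to the
	// correct shard leader internally, so no hash-to-slice mapping is needed here.
	public static void index(Iterable<solrBean> beans) throws Exception {
		CloudSolrClient cloud = new CloudSolrClient("10.254.9.65:2181,10.254.9.66:2181,10.254.9.67:2181");
		cloud.setDefaultCollection("e2e6f6d8d94848cdaa26d9560fc3e791");
		cloud.connect();

		List<SolrInputDocument> batch = new ArrayList<SolrInputDocument>();
		for (solrBean value : beans) {
			SolrInputDocument doc = new SolrInputDocument();
			doc.setField("id", value.getId().toString());
			doc.setField("content", value.getContent().toString());
			doc.setField("pubtime", value.getPubtime().toString());
			doc.setField("title", value.getTitle().toString());
			doc.setField("source", value.getSource().toString());
			batch.add(doc);
			if (batch.size() >= 1000) { // flush in batches instead of one doc at a time
				cloud.add(batch);
				batch.clear();
			}
		}
		if (!batch.isEmpty()) {
			cloud.add(batch);
		}
		cloud.commit();
		cloud.close();
	}
}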





If anything in the program is wrong, please point it out. Thanks.
