A few days ago I ran into a problem indexing a large volume of data: the upload to Solr was consistently slow. I had seen articles online implementing a route-aware MapReduce indexing program (the mapper computes each document's target shard, so the reducer can send documents straight to that shard's leader), so I wrote one myself. The code is below:
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.SystemDefaultHttpClient;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.DocRouter.Range;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.Hash;
import org.json.JSONObject;
// (Logger is assumed to be log4j's; solrBean and MD5 are the author's own helper classes, not shown here.)

public class MapReduceWithRoute extends Configured implements Tool {
/**
* Entry point: runs the job through ToolRunner and prints the total elapsed time.
*
* @param args input path, output path, number of reduce tasks
* @throws Exception
*/
public static void main(String[] args) throws Exception {
long start = System.currentTimeMillis();
System.out.println("hello");
ToolRunner.run(new MapReduceWithRoute(), args);
long end = System.currentTimeMillis();
System.out.println("time is:" + ((end - start)));
}
public int run(String args[]) throws Exception {
Job job;
try {
Configuration conf = getConf();
if (conf == null)
conf = new Configuration();
int numReducerTasks = 3;// default
try {
numReducerTasks = Integer.parseInt(args[2]);
} catch (Exception e) {
System.out.println("Exception occurred getting reducers " + e.getMessage());
e.printStackTrace();
}
// conf.set("dfs.replication", "2");
// conf.set("mapred.map.tasks.speculative.execution", "false");
// conf.set("mapred.reduce.tasks.speculative.execution", "false");
// conf.set("mapreduce.job.ubertask.enable", "true");
job = Job.getInstance(conf, "mapreducetest");
job.setJarByClass(MapReduceWithRoute.class);
job.setMapperClass(TestMapper.class);
job.setReducerClass(TestReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(solrBean.class);
job.setNumReduceTasks(numReducerTasks);
// Optional: a custom partitioner could pin each shard key to its own reducer;
// a possible TestPartition is sketched after this listing.
// job.setPartitionerClass(TestPartition.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.setInputPaths(job, args[0]);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
} catch (Exception e) {
e.printStackTrace();
}
return 0;
}
/**
* Mapper: parses each JSON input line into a solrBean and keys it by the Solr shard
* that the document id hashes to.
*
* @author Administrator
*
*/
public static class TestMapper extends Mapper<LongWritable, Text, Text, solrBean> {
String ZK_HOST = "10.254.9.65:2181,10.254.9.66:2181,10.254.9.67:2181";
String APP_ID = "e2e6f6d8d94848cdaa26d9560fc3e791";
private static final Logger LOG = Logger.getLogger(TestMapper.class);
DocCollection docCollection = null;
CloudSolrClient client = null;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
// The mapper only reads the cluster state (slice -> hash range mapping) from ZooKeeper;
// it does not send any documents to Solr itself.
HttpClient httpClient = new DefaultHttpClient();
client = new CloudSolrClient(ZK_HOST, httpClient);
client.setDefaultCollection(APP_ID);
client.connect();
LOG.info("size:" + client.getZkStateReader().getClusterState().getCollections().size());
docCollection = client.getZkStateReader().getClusterState().getCollection(APP_ID);
LOG.info(docCollection.getActiveSlices().size());
}
@Override
public void map(LongWritable key, Text columns, Context context) throws IOException, InterruptedException {
solrBean lineInfo = parseLog(columns);
if (lineInfo != null) {
LOG.info("line info:" + lineInfo.getContent());
String id = lineInfo.getId().toString();
int sliceHash = sliceHash(id);
LOG.info("name1:" + docCollection.getName());
Slice slice = hashToSlice(sliceHash, docCollection);
String shardName = slice.getName(); // e.g. "shard1", "shard2", ...
shardName = shardName.substring(5); // strip the "shard" prefix, keeping only the number
LOG.info("map out:" + shardName + " + " + lineInfo);
int shardId = Integer.parseInt(shardName);
// Keying by the shard id sends all documents for one shard to the same reducer.
// (To fan one shard out over ten reducers, one could key on 10 * shardId plus a random
// digit 0-9 instead, but the reducer below expects the plain shard id as its key.)
context.write(new Text(shardId + ""), lineInfo);
}
}
// Same murmurhash3 that Solr's hash-based router applies to a plain (non-composite) id.
private int sliceHash(String id) {
return Hash.murmurhash3_x86_32(id, 0, id.length(), 0);
}
// copy from org.apache.solr.common.cloud.HashBasedRouter
private Slice hashToSlice(int hash, DocCollection collection) {
LOG.info(collection.getName());
for (Slice slice : collection.getSlices()) {
Range range = slice.getRange();
if (range != null && range.includes(hash))
return slice;
}
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"No slice servicing hash code " + Integer.toHexString(hash) + " in " + collection);
}
private solrBean parseLog(Text line) {
solrBean log = null;
if (line != null && !line.toString().isEmpty()) {
JSONObject jsonObject = new JSONObject(line.toString());
log = new solrBean();
log.setContent(new Text(jsonObject.get("content").toString()));
log.setPubtime(new Text(jsonObject.get("pubtime").toString()));
log.setSource(new Text(jsonObject.get("source").toString()));
log.setTitle(new Text(jsonObject.get("title").toString()));
String md5 = MD5.getMD5(jsonObject.get("content").toString() + jsonObject.get("pubtime").toString()
+ jsonObject.get("source").toString() + jsonObject.get("title").toString());
log.setId(new Text(md5));
}
return log;
}
}
/**
* Reducer: receives all documents for one shard and sends them in one batch to that
* shard's leader.
*
* @author Administrator
*
*/
public static class TestReducer extends Reducer<Text, solrBean, Text, Text> {
private static final Logger LOG = Logger.getLogger(TestReducer.class);
String ZK_HOST = "10.254.9.65:2181,10.254.9.66:2181,10.254.9.67:2181";
String APP_ID = "e2e6f6d8d94848cdaa26d9560fc3e791";
CloudSolrClient client = null;
HttpClient httpClient1 = null;
ZkStateReader reader = null;
HashMap<Integer, HttpSolrClient> solrServers = null;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
httpClient1 = new DefaultHttpClient();
client = new CloudSolrClient(ZK_HOST, httpClient1);
client.setDefaultCollection(APP_ID);
client.connect();
reader = client.getZkStateReader();
solrServers = new HashMap<Integer, HttpSolrClient>();
try {
// Look up the leader URL of each shard and keep one HttpSolrClient per shard.
// Note: the shard count (3) is hard-coded here and must match the collection layout.
for (int i = 1; i <= 3; i++) {
SystemDefaultHttpClient httpClient2 = new SystemDefaultHttpClient();
String url = reader.getLeaderUrl(APP_ID, "shard" + i, 3000);
HttpSolrClient solrClient = new HttpSolrClient(url, httpClient2);
solrServers.put(i, solrClient);
}
} catch (Exception e) {
LOG.info("add solrServers error:" + e.getMessage());
}
}
@Override
public void reduce(Text key, Iterable<solrBean> values, Context context)
throws IOException, InterruptedException {
long start = System.currentTimeMillis();
String partition = key.toString();
LOG.info("key:" + partition);
// Pick the pre-built HttpSolrClient for this shard based on the key (the shard id).
int shardid = Integer.parseInt(partition);
List<SolrInputDocument> doclist = new ArrayList<SolrInputDocument>();
try {
HttpSolrClient solrServer = solrServers.get(shardid);
long end = System.currentTimeMillis();
LOG.info("end:" + (end - start));
int i = 0;
for (solrBean value : values) {
SolrInputDocument doc = new SolrInputDocument();
// LOG.info("id hashcode1:" + doc.hashCode());
doc.setField("id", value.getId().toString());
doc.setField("content", value.getContent().toString());
doc.setField("pubtime", value.getPubtime().toString());
doc.setField("title", value.getTitle().toString());
doc.setField("source", value.getSource().toString());
doclist.add(doc);
i++;
}
LOG.info("doclist size:" + doclist.size());
// Send the whole batch for this shard in one request, then commit.
solrServer.add(doclist);
solrServer.commit();
long end1 = System.currentTimeMillis();
LOG.info("end1:" + (end1 - end));
context.write(key, new Text(i + ""));
} catch (Exception e) {
e.printStackTrace();
LOG.info("get solrserver exception:" + e.getMessage());
}
}
}
}
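In the listing above, job.setPartitionerClass(TestPartition.class) is left commented out, so Hadoop's default HashPartitioner decides which reducer handles each shard key. That still works, because every key lands on exactly one reducer and the reducer looks up the right shard leader by key, but the shard-to-reducer mapping is arbitrary. If you want shard N pinned to reducer N-1, a partitioner along the following lines could be used; this is only a sketch of what the referenced TestPartition might look like (the original class is not shown in this post):

// Sketch only; assumes the map output key is the shard number as text ("1", "2", ...).
// Requires: import org.apache.hadoop.mapreduce.Partitioner;
public static class TestPartition extends Partitioner<Text, solrBean> {
@Override
public int getPartition(Text key, solrBean value, int numPartitions) {
int shardId = Integer.parseInt(key.toString());
// Shard N goes to reducer (N - 1), wrapped in case fewer reducers are configured.
return (shardId - 1) % numPartitions;
}
}

With the job as written, it would be launched roughly as hadoop jar <jar> MapReduceWithRoute <input path> <output path> <number of reducers>, since run() reads the reducer count from args[2].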
In testing, the MapReduce program is indeed faster than building the index locally, but I also found that plain CloudSolrClient indexing from within MapReduce is already quite fast. The biggest factor affecting indexing speed turned out to be the granularity configured for the analyzer (tokenizer): its effect shows up at the order-of-magnitude level.
If anything in the program is wrong, please point it out. Thanks.
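To make the granularity point concrete: indexing cost scales with the number of tokens the analyzer emits, and a fine-grained segmentation can emit many times more tokens per document than a coarse one. The post does not show which analyzer this collection actually uses, so the following is only an illustrative sketch: it counts tokens for the same text with Lucene's StandardAnalyzer (which splits CJK text into single characters, i.e. very fine-grained) and SmartChineseAnalyzer (word-level segmentation, coarser).

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class TokenCountDemo {
// Count how many tokens the given analyzer produces for one piece of text.
static int countTokens(Analyzer analyzer, String text) throws IOException {
TokenStream ts = analyzer.tokenStream("content", text);
ts.reset();
int n = 0;
while (ts.incrementToken()) {
n++;
}
ts.end();
ts.close();
return n;
}

public static void main(String[] args) throws IOException {
String text = "..."; // put a sample document body here
System.out.println("standard (per-character): " + countTokens(new StandardAnalyzer(), text));
System.out.println("smartcn (per-word): " + countTokens(new SmartChineseAnalyzer(), text));
}
}

Running this on a real document body makes the token-count gap, and hence the indexing-cost gap, easy to see; the same comparison applies to whatever analyzer and granularity options the collection's schema actually uses.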