1:用solrj 的api 调用封装SolrInputDocument 提交solr 服务创建索引
package test.client.impl;
import java.io.IOException;
import java.net.MalformedURLException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.request.CoreAdminRequest;
import org.apache.solr.common.SolrInputDocument;
/**
 * Builds a full product index by paging rows out of DataCenter_TempTable,
 * converting each row to a {@link SolrInputDocument}, round-robin submitting
 * batches to the intermediate cores (core0..core4), then merging all partial
 * indexes into targetCore. Relies on {@code BaseDao.conn} for the JDBC connection.
 */
public class DBTestDao extends BaseDao {
	private static final Logger LOGGER = Logger.getLogger(DBTestDao.class);
	// Rows fetched from the database and submitted per batch.
	private int per = 2000;
	// Total row count of DataCenter_TempTable; loop bound for process().
	private int count = 74000;
	// Paging cursor: getDocList() selects rows with start < num < start + per.
	private int start = 0;
	// Number of intermediate cores that build partial indexes.
	private int coreCount = 5;
	// Core that receives the merged index.
	private String targetCore = "http://127.0.0.1:8000/solr/targetCore";
	// Base URL of the intermediate cores; the core number is appended.
	private String url = "http://127.0.0.1:8000/solr/core";
	// Filesystem path of the core index dirs; the core number is appended.
	private String indexParentPath = "D:/solr_OB1B/app/solr_multcore/core";
	static {
		PropertyConfigurator.configure("log4j.properties");
	}

	/**
	 * Deletes all documents from every intermediate core (core0..coreN-1).
	 * Called after the merge so stale partial indexes do not linger.
	 */
	private void removeOtherIndex() throws SolrServerException, IOException {
		for (int i = 0; i < coreCount; i++) {
			CommonsHttpSolrServer server = new CommonsHttpSolrServer(url + i);
			server.deleteByQuery("*:*");
			server.optimize();
			server.commit();
		}
	}

	/**
	 * Best-effort wipe of the target core before a full rebuild.
	 * Failures are logged (with cause) but deliberately not propagated,
	 * preserving the original best-effort semantics.
	 */
	private void removeTargetIndex() {
		try {
			CommonsHttpSolrServer server = new CommonsHttpSolrServer(targetCore);
			server.deleteByQuery("*:*");
			server.optimize();
			server.commit();
			LOGGER.info("清空targetCore的数据");
		} catch (MalformedURLException e) {
			LOGGER.error("Bad target core URL: " + targetCore, e);
		} catch (SolrServerException e) {
			LOGGER.error("Solr error while clearing target core " + targetCore, e);
		} catch (IOException e) {
			LOGGER.error("I/O error while clearing target core " + targetCore, e);
		}
	}

	/**
	 * Reads the next batch of rows (start &lt; num &lt; start + per) and converts
	 * each row into a SolrInputDocument. Advances {@code start} by {@code per}
	 * on success.
	 *
	 * @return documents for the current batch (possibly empty)
	 * @throws RuntimeException if the query fails; propagating instead of
	 *         swallowing prevents process() from spinning forever, since the
	 *         cursor would otherwise never advance
	 */
	private List<SolrInputDocument> getDocList() {
		String sql = "select num,metaid,active,productname,catalogid,url,price,count_productmeta,minimum_price,maximum_price,catalogid1,catalogname1,catalogid2,catalogname2,catalogid3,catalogname3,brandid,brandname, imagepath ,[rank],[description],displayname,updatetime,createdatetime,isfirstproduct,commoncount,type from DataCenter_TempTable as t where t.num>"
				+ start + " AND t.num<" + (start + per);
		List<SolrInputDocument> list = new ArrayList<SolrInputDocument>();
		Statement stat = null;
		ResultSet rs = null;
		try {
			stat = conn.createStatement();
			LOGGER.info(sql);
			rs = stat.executeQuery(sql);
			while (rs.next()) {
				SolrInputDocument doc = new SolrInputDocument();
				doc.addField("metaid", rs.getObject("metaid"));
				doc.addField("id", rs.getObject("num"));
				//doc.addField("price", rs.getObject("price"));
				doc.addField("url", rs.getObject("url"));
				doc.addField("count_productmeta", rs.getObject("count_productmeta"));
				doc.addField("minimum_price", rs.getObject("minimum_price"));
				doc.addField("maximum_price", rs.getObject("maximum_price"));
				doc.addField("rank", rs.getObject("rank"));
				doc.addField("imagepath", rs.getObject("imagepath"));
				doc.addField("displayname", rs.getObject("displayname"));
				doc.addField("description", rs.getObject("description"));
				// NOTE(review): brandfacet is filled from "num" and the three
				// catalogid fields below all from "metaid" rather than their
				// same-named columns — looks like copy-paste; confirm intent.
				doc.addField("brandfacet", rs.getObject("num"));
				doc.addField("brandname", rs.getObject("brandname"));
				doc.addField("productname", rs.getObject("productname"));
				doc.addField("catalogid3", rs.getObject("metaid"));
				doc.addField("catalogfacet3", rs.getObject("catalogname3"));
				doc.addField("catalogid2", rs.getObject("metaid"));
				doc.addField("catalogfacet2", rs.getObject("catalogname2"));
				doc.addField("catalogid1", rs.getObject("metaid"));
				doc.addField("catalogfacet1", rs.getObject("catalogname1"));
				doc.addField("updatetime", rs.getObject("updatetime"));
				doc.addField("createdatetime", rs.getObject("createdatetime"));
				doc.addField("isfirstproduct", rs.getObject("isfirstproduct"));
				doc.addField("catalogid", rs.getObject("catalogid"));
				doc.addField("commoncount", rs.getObject("commoncount"));
				// Concatenate the tokenizable columns into the default search field.
				String searchText = rs.getString("brandname") + rs.getString("catalogname3")
						+ rs.getString("productname") + rs.getInt("catalogid1") + rs.getInt("catalogid2")
						+ rs.getInt("catalogid3");
				doc.addField("searchText", searchText);
				list.add(doc);
			}
			start = start + per;
		} catch (SQLException e) {
			// Fail fast: silently returning an empty list would leave 'start'
			// unchanged and make process() loop forever on a dead connection.
			throw new RuntimeException("Failed to load batch starting at num=" + start, e);
		} finally {
			// Pre-Java-7 style: close JDBC resources explicitly to avoid leaking
			// a cursor per batch (original code never closed them).
			if (rs != null) {
				try {
					rs.close();
				} catch (SQLException e) {
					LOGGER.error("Failed to close ResultSet", e);
				}
			}
			if (stat != null) {
				try {
					stat.close();
				} catch (SQLException e) {
					LOGGER.error("Failed to close Statement", e);
				}
			}
		}
		return list;
	}

	/**
	 * Fetches one batch and submits it to the given intermediate core.
	 *
	 * @param core index of the core, appended to the base core URL
	 */
	public void startCoreIndex(int core) throws SolrServerException, IOException {
		String solrUrl = url + core;
		List<SolrInputDocument> list = getDocList();
		CommonsHttpSolrServer server = new CommonsHttpSolrServer(solrUrl);
		server.add(list);
		// Use 'per' (was a hard-coded 2000) so the log stays correct if the
		// batch size field is ever changed.
		LOGGER.info(".........开始为core" + core + "提交数据!!!数据开始【" + (start - per) + "】结束【" + start + "】");
		server.commit();
		LOGGER.info(".........core" + core + "数据提交结束");
	}

	/**
	 * Merges every intermediate core's on-disk index into targetCore via the
	 * core admin API, then commits and optimizes the merged core.
	 */
	public void mergeIndex() throws SolrServerException, IOException {
		for (int i = 0; i < coreCount; i++) {
			LOGGER.info("开始合并:core" + i);
			String[] strs = { indexParentPath + i + "/data/index" };
			CoreAdminRequest.mergeIndexes("targetCore", strs, new CommonsHttpSolrServer("http://127.0.0.1:8000/solr"));
			LOGGER.info("core" + i + "合并完成.....");
		}
		// (Removed a redundant local alias that shadowed the field name.)
		CommonsHttpSolrServer targetServer = new CommonsHttpSolrServer(targetCore);
		targetServer.commit();
		targetServer.optimize();
		LOGGER.info("所有 core合并完成.....");
	}

	/**
	 * Full pipeline: clear targetCore, index all batches round-robin across the
	 * intermediate cores, merge into targetCore, then clear the intermediates.
	 */
	public void process() throws SolrServerException, IOException {
		removeTargetIndex();
		long startDate = System.currentTimeMillis();
		while (start < count) {
			// 1. Hand each core one batch per round until the cursor passes count.
			for (int i = 0; i < coreCount; i++) {
				if (start < count) {
					startCoreIndex(i);
				}
			}
		}
		LOGGER.info("数据索引完成");
		mergeIndex();
		long endDate = System.currentTimeMillis();
		// Elapsed time in milliseconds (was a bare System.out.println).
		LOGGER.info("~~~~~~~~~~~~~~~~~~~" + (endDate - startDate));
		removeOtherIndex();
	}

	public static void main(String[] args) {
		DBTestDao dao = new DBTestDao();
		try {
			dao.process();
		} catch (SolrServerException e) {
			LOGGER.error("Indexing failed (Solr error)", e);
		} catch (IOException e) {
			LOGGER.error("Indexing failed (I/O error)", e);
		}
	}
}
package test.client.impl;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

/**
 * Base DAO that eagerly opens a single shared JDBC connection to the crawler
 * SQL Server database in a static initializer. Subclasses (e.g. DBTestDao)
 * read the protected {@code conn} field directly.
 *
 * NOTE(review): credentials are hard-coded in source; move them to external
 * configuration. Also, if driver loading or connection fails, {@code conn}
 * stays null and subclasses will NPE on first use — the error is logged here.
 */
public class BaseDao {
	private static final Logger LOGGER = Logger.getLogger(BaseDao.class);
	private static String url = "jdbc:sqlserver://10.16.230.40:1433;DatabaseName=CrawlerDataCenter_Zol_Test";
	private static String password = "xalab@123";
	private static String username = "sa";
	// Single shared connection for all DAO subclasses.
	protected static Connection conn;
	static {
		try {
			PropertyConfigurator.configure("log4j.properties");
			Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver");
			conn = DriverManager.getConnection(url, username, password);
		} catch (ClassNotFoundException e) {
			// Log with cause instead of the original swallowed printStackTrace().
			LOGGER.error("SQL Server JDBC driver not on classpath", e);
		} catch (SQLException e) {
			LOGGER.error("Failed to open database connection to " + url, e);
		}
	}

	public BaseDao() {
	}

	public static void main(String[] args) {
		// Smoke test: constructing triggers the static connection setup.
		BaseDao dao = new BaseDao();
	}
}
<?xml version="1.0" encoding="UTF-8" ?>
<!--
  Solr index schema for the product search cores.
  Field names line up with the columns submitted by DBTestDao.getDocList()
  (and the data-config.xml DIH import). searchText is the default search field.
-->
<schema name="example" version="1.2">
<types>
<!-- Trie-based primitives; precisionStep="0" indexes a single precision level. -->
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<!-- Chinese text type built on the IK analyzer.
     NOTE(review): an untyped <analyzer> (which normally covers BOTH index and
     query) is declared alongside a type="index" analyzer chain. The richer
     chain (stopwords, word-delimiter, lowercase, reversed-wildcard) then only
     applies at index time, leaving bare IKAnalyzer for queries — confirm this
     asymmetry is intentional, otherwise case/stopword handling will differ
     between indexing and querying. -->
<fieldtype name="textZN" class="solr.TextField">
<analyzer class="org.wltea.analyzer.lucene.IKAnalyzer"/>
<analyzer type="index">
<tokenizer class="org.wltea.analyzer.solr.IKTokenizerFactory" isMaxWordLength="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
</analyzer>
</fieldtype>
</types>
<fields>
<!-- Unique document key: populated from the "num" column by the API indexer. -->
<field name="id" type="string" indexed="true" stored="true" required="true" />
<!-- NOTE(review): the API indexer comments out its "price" field and nothing
     visible writes "itemPrice" — verify which price field the importer fills. -->
<field name="itemPrice" type="float" indexed="true" stored="true" />
<field name="metaid" type="int" indexed="true" stored="true" />
<field name="imagepath" type="string" indexed="true" stored="true" />
<field name="brandid" type="int" indexed="true" stored="false" />
<field name="brandfacet" type="string" indexed="true" stored="true" />
<field name="brandname" type="string" indexed="true" stored="true" />
<field name="url" type="string" indexed="true" stored="true" />
<field name="count_productmeta" type="string" indexed="true" stored="true" />
<field name="minimum_price" type="float" indexed="true" stored="true" />
<field name="maximum_price" type="float" indexed="true" stored="true" />
<field name="rank" type="int" indexed="true" stored="true" />
<field name="displayname" type="string" indexed="true" stored="true" />
<!-- Full-text (IK-analyzed) field. -->
<field name="description" type="textZN" indexed="true" stored="true" />
<field name="pid" type="int" indexed="true" stored="false" multiValued="true" />
<field name="propFacet" type="string" indexed="true" stored="true" multiValued="true" />
<field name="productname" type="string" indexed="true" stored="true" />
<!-- Three-level catalog hierarchy: id (search only) + facet label (stored). -->
<field name="catalogid1" type="int" indexed="true" stored="false" />
<field name="catalogfacet1" type="string" indexed="true" stored="true" />
<field name="catalogid2" type="int" indexed="true" stored="false" />
<field name="catalogfacet2" type="string" indexed="true" stored="true" />
<field name="catalogid3" type="int" indexed="true" stored="false" />
<field name="catalogfacet3" type="string" indexed="true" stored="true" />
<field name="updatetime" type="string" indexed="true" stored="true" />
<field name="createdatetime" type="string" indexed="true" stored="true" />
<field name="isfirstproduct" type="int" indexed="true" stored="true" />
<field name="catalogid" type="int" indexed="true" stored="true" />
<field name="commoncount" type="int" indexed="true" stored="true" />
<field name="catalogtype" type="string" indexed="true" stored="true" />
<field name="autocompleteword" type="textZN" indexed="true" stored="true" />
<!-- Catch-all search field; filled by concatenating brand/catalog/product text. -->
<field name="searchText" type="textZN" indexed="true" stored="true" />
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have
a "*" only at the start or the end.
EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
Longer patterns will be matched first. if equal size patterns
both match, the first appearing in the schema will be used. -->
<dynamicField name="*_i" type="int" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
</fields>
<uniqueKey>id</uniqueKey>
<defaultSearchField>searchText</defaultSearchField>
<!-- Require all query terms by default. -->
<solrQueryParser defaultOperator="AND"/>
<!--
<copyField source="itemName" dest="searchText" />
<copyField source="itemBrand" dest="searchText" />
<copyField source="itemCatalog" dest="searchText" />
<copyField source="itemDesc" dest="searchText" />
-->
</schema>
2:通过data-config.xml 配置进行数据导入
import java.io.File; import java.io.IOException; import java.util.Date; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer; import org.apache.solr.client.solrj.request.CoreAdminRequest; import org.apache.solr.client.solrj.request.QueryRequest; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.NamedList; import com.newegg.labxa.ob1b.search.tools.OpertProperty; import com.newegg.labxa.ob1b.search.tools.SolrServiceFactory; public class SolrCreateIndex { private static final Logger LOGGER = Logger.getLogger(SolrCreateIndex.class); private static final OpertProperty OPT = OpertProperty.getInstance(); private static final String TARGET_CORE_NAME = OPT.getValue(OpertProperty.TARGET_CORE); private static String createSolrUrl = OPT.getValue(OpertProperty.CREATE_SOLR_URL); private static int start = 1; private static int size = 1000; static { size = OPT.getValueForInteger(OpertProperty.CORE_DATA_NUM); if (!createSolrUrl.endsWith("/")) { createSolrUrl = createSolrUrl + "/"; } PropertyConfigurator.configure("log4j.properties"); } /** * multicore创建索引的入口函数 */ public static void createIndex(int totalNum) { //在索引要全部建之前,先清空targetCore里边旧索引 deleteCoreIndex(new String[] { TARGET_CORE_NAME }); String[] cores = OPT.getValueForStringArray(OpertProperty.SOLR_CORES); if (cores == null || cores.length == 0) { return; } long startDate = new Date().getTime(); while (start < totalNum) { // 1.调用每一个core开始创建索引 for (int i = 0; i < cores.length; i++) { if (start < totalNum) { startOneCoreCreate(createSolrUrl + cores[i] + "/"); } } } //2.merging(合并每一个core下面的索引到targetCore里) try { mergingIndex(cores, TARGET_CORE_NAME); long endDate = new Date().getTime(); System.out.println("~~~~~~~~~~~~~~~~~~~" + (endDate - startDate)); } catch (InterruptedException e) { if 
(LOGGER.isInfoEnabled()) { LOGGER.info(e); } } catch (Exception e) { if (LOGGER.isInfoEnabled()) { LOGGER.info(e); } } // 3.删除core0,core1,core2...下面的index deleteCoreIndex(cores); } /** * 启动每一个core实现对index的创建 */ private static void startOneCoreCreate(String url) { CommonsHttpSolrServer server = SolrServiceFactory.getSolrServer(url); // 判断当前的core的index是否创建完成 boolean indexIsOk = createIndexIsOk(server); if (!indexIsOk) { return; } ModifiableSolrParams params = new ModifiableSolrParams(); params.add("qt", "/dataimport"); params.add("clean", "false"); params.add("command", "full-import"); params.add("commit", "true"); params.add("startid", String.valueOf(start)); params.add("endid", String.valueOf(start + size)); try { start = start + size; // 修改下标 server.query(params); server.commit(); } catch (SolrServerException e) { if (LOGGER.isInfoEnabled()) { LOGGER.info(e); } start = start - size; // 回滚修改的下标 } catch (IOException e) { if (LOGGER.isInfoEnabled()) { LOGGER.info(e); } start = start - size; // 回滚修改的下标 } LOGGER.info("创建索引:" + url + "start=" + (start - size) + " end=" + start); } /** * 索引的合并 */ private static void mergingIndex(String[] cores, String targetCoreName) throws Exception { String indexParentPath = OPT.getValue(OpertProperty.INDEX_PARENT_PATH); if (!indexParentPath.endsWith("/")) { indexParentPath = indexParentPath + "/"; } CommonsHttpSolrServer server = SolrServiceFactory.getSolrServer(createSolrUrl); for (String coreName : cores) { LOGGER.info("开始合并:" + coreName); String indexPath = indexParentPath + coreName + "/data/index"; File dir = new File(indexPath); if (!dir.exists() && !dir.isDirectory()) { throw new RuntimeException(indexPath); } String[] strs = { indexParentPath + coreName + "/data/index" }; /* --------------------判断core的索引是否创建完成----------------------------- */ String oneCoreUrl = createSolrUrl + coreName + "/"; CommonsHttpSolrServer oneCoreServer = SolrServiceFactory.getSolrServer(oneCoreUrl); int i = 0; while (i < 500) { boolean isOver = 
createIndexIsOk(oneCoreServer); if (isOver) { break; } // 防止死循环!!! i++; } /* --------------------判断core的索引是否创建完成----------------------------- */ LOGGER.info(targetCoreName); for (String s : strs) { LOGGER.info(s); } LOGGER.info(createSolrUrl); CoreAdminRequest.mergeIndexes(targetCoreName, strs, server); LOGGER.info(coreName + "合并完成....."); } String targetCore = createSolrUrl + targetCoreName; CommonsHttpSolrServer targetServer = SolrServiceFactory.getSolrServer(targetCore); targetServer.commit(); targetServer.optimize(); LOGGER.info("索引全部合并完成...."); } /** * 判断core是是否已经创建完成 * (通过给http:127.0.0.1:8080/solr/colrxx/dataimport来获得DIH线程的状态是'busy'还是'idle') */ private static boolean createIndexIsOk(CommonsHttpSolrServer oneCoreServer) { // 防止判断core状态时刷新太快,所以每次判断时先sleep 2s try { Thread.sleep(2 * 1000); } catch (Exception e) { if (LOGGER.isInfoEnabled()) { LOGGER.info(e); } } System.out.println("*************************************************"); ModifiableSolrParams params = new ModifiableSolrParams(); params.add("qt", "/dataimport"); SolrRequest request = new QueryRequest(params); try { NamedList<Object> names = oneCoreServer.request(request); String diHStart = (String) names.get("status"); LOGGER.info("core的状况:" + oneCoreServer.getBaseURL() + " :" + diHStart); if (diHStart != null && "idle".equalsIgnoreCase(diHStart)) { return true; } } catch (SolrServerException e) { if (LOGGER.isInfoEnabled()) { LOGGER.info(e); } } catch (IOException e) { if (LOGGER.isInfoEnabled()) { LOGGER.info(e); } } return false; } /** * 删除core下面的index * * @param cores */ private static void deleteCoreIndex(String[] cores) { for (String core : cores) { String coreUrl = createSolrUrl + core; CommonsHttpSolrServer coreServer = SolrServiceFactory.getSolrServer(coreUrl); try { coreServer.deleteByQuery("*:*"); coreServer.optimize(); coreServer.commit(); if (LOGGER.isInfoEnabled()) { LOGGER.info("删除" + core + "上的索引完成"); } } catch (SolrServerException e) { if (LOGGER.isInfoEnabled()) { LOGGER.info(e); } } 
catch (IOException e) { if (LOGGER.isInfoEnabled()) { LOGGER.info(e); } } } } /** * 对targetCore只执行delta-import,但不进行优化,这样就在replication时只把update的index发给slave机 */ public static void deltaIndex() { String targetCoreUrl = createSolrUrl + TARGET_CORE_NAME; CommonsHttpSolrServer server = SolrServiceFactory.getSolrServer(targetCoreUrl); ModifiableSolrParams params = new ModifiableSolrParams(); params.add("qt", "/dataimport"); params.add("clean", "false"); params.add("command", "delta-import"); params.add("commit", "true"); // 这里增量的索引不进行合并,这样就每次给slave机分发新增的部分 try { server.query(params); server.commit(); } catch (SolrServerException e) { if (LOGGER.isInfoEnabled()) { LOGGER.info(e); } } catch (IOException e) { if (LOGGER.isInfoEnabled()) { LOGGER.info(e); } } LOGGER.info("更新索引:" + targetCoreUrl); } public static void main(String[] args) { //deleteCoreIndex(new String[] { "targetCore" }); createIndex(74000); //deltaIndex(); } }
Sql 总记录 74000 条
方式一:
data-config.xml 配置方式索引
索引 field 31 个
core数量 5个
生成 index 文件 141MB
共耗时 93061ms
平均 0.001515MB/ms
方式二:
Api SolrInputDocument 方式提交
索引 field 25 个
core数量 5个
生成 index 文件 113MB
共耗时 66061ms
平均 0.001710MB/ms