热更新
每次都是在es的扩展词典中,手动添加新词语,很坑
(1)每次添加完,都要重启es才能生效,非常麻烦
(2)es是分布式的,可能有数百个节点,你不能每次都一个一个节点上面去修改
es不停机,直接我们在外部某个地方添加新的词语,es中立即热加载到这些新词语
热更新的方案
(1)修改ik分词器源码,然后手动支持从mysql中每隔一定时间,自动加载新的词库
(2)基于ik分词器原生支持的热更新方案,部署一个web服务器,提供一个http接口,通过modified和tag两个http响应头,来提供词语的热更新
用第一种方案,第二种,ik git社区官方都不建议采用,觉得不太稳定
1、下载源码
https://github.com/medcl/elasticsearch-analysis-ik/tree/v5.2.0
ik分词器,是个标准的java maven工程,直接导入eclipse就可以看到源码
2、修改源码
Dictionary类,169行:Dictionary单例类的初始化方法,在这里需要创建一个我们自定义的线程,并且启动它
/**
* 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
* 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间 该方法提供了一个在应用加载阶段就初始化字典的手段
*
* @return Dictionary
*/
public static synchronized Dictionary initial(Configuration cfg) {
if (singleton == null) {
synchronized (Dictionary.class) {
if (singleton == null) {
singleton = new Dictionary(cfg);
singleton .loadMainDict();
singleton .loadSurnameDict();
singleton .loadQuantifierDict();
singleton .loadSuffixDict();
singleton .loadPrepDict();
singleton .loadStopWordDict();
new Thread(new HotDictReloadThread()).start();
if(cfg.isEnableRemoteDict()){
// 建立监控线程
for (String location : singleton .getRemoteExtDictionarys()) {
// 10 秒是初始延迟可以修改的 60是间隔时间 单位秒
pool .scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS );
}
for (String location : singleton .getRemoteExtStopWordDictionarys()) {
pool .scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS );
}
}
return singleton ;
}
}
}
return singleton ;
}
HotDictReloadThread类:就是死循环,不断调用Dictionary.getSingleton().reLoadMainDict(),去重新加载词典
package org.wltea.analyzer.dic;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.logging.ESLoggerFactory;
public class HotDictReloadThread implements Runnable {
private static final Logger logger = ESLoggerFactory.getLogger (HotDictReloadThread.class.getName());
@Override
public void run() {
while(true) {
logger .info("[==========]reload hot dict from mysql......");
Dictionary.getSingleton ().reLoadMainDict();
}
}
}
Dictionary类,389行:this.loadMySQLExtDict();
/**
* 加载主词典及扩展词典
*/
private void loadMainDict() {
// 建立一个主词典实例
_MainDict = new DictSegment((char) 0);
// 读取主词典文件
Path file = PathUtils.get (getDictRoot(), Dictionary.PATH_DIC_MAIN );
InputStream is = null;
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
logger .error(e.getMessage(), e);
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_MainDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
} catch (IOException e) {
logger .error("ik-analyzer", e);
} finally {
try {
if (is != null) {
is.close();
is = null;
}
} catch (IOException e) {
logger .error("ik-analyzer", e);
}
}
// 加载扩展词典
this.loadExtDict();
// 加载远程自定义词库
this.loadRemoteExtDict(); // 从mysql加载词典
this.loadMySQLExtDict();
}
/**
* 从mysql加载热更新词典
*/
private void loadMySQLExtDict() {
Connection conn = null;
Statement stmt = null;
ResultSet rs = null;
try {
Path file = PathUtils.get (getDictRoot(), "jdbc-reload.properties");
prop .load(new FileInputStream(file.toFile()));
logger .info("[==========]jdbc-reload.properties");
for(Object key : prop .keySet()) {
logger .info("[==========]" + key + "=" + prop .getProperty(String.valueOf (key)));
}
logger .info("[==========]query hot dict from mysql, " + prop .getProperty("jdbc.reload.sql") + "......");
conn = DriverManager.getConnection (
prop .getProperty("jdbc.url"),
prop .getProperty("jdbc.user"),
prop .getProperty("jdbc.password"));
stmt = conn.createStatement();
rs = stmt.executeQuery(prop .getProperty("jdbc.reload.sql"));
while(rs.next()) {
String theWord = rs.getString("word");
logger .info("[==========]hot word from mysql: " + theWord);
_MainDict.fillSegment(theWord.trim().toCharArray());
}
Thread.sleep (Integer.valueOf (String.valueOf (prop .get("jdbc.reload.interval"))));
} catch (Exception e) {
logger .error("erorr", e);
} finally {
if(rs != null) {
try {
rs.close();
} catch (SQLException e) {
logger .error("error", e);
}
}
if(stmt != null) {
try {
stmt.close();
} catch (SQLException e) {
logger .error("error", e);
}
}
if(conn != null) {
try {
conn.close();
} catch (SQLException e) {
logger .error("error", e);
}
}
}
}
Dictionary类,683行:this.loadMySQLStopwordDict();
/**
* 加载用户扩展的停止词词典
*/
private void loadStopWordDict() {
// 建立主词典实例
_StopWords = new DictSegment((char) 0);
// 读取主词典文件
Path file = PathUtils.get (getDictRoot(), Dictionary.PATH_DIC_STOP );
InputStream is = null;
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
logger .error(e.getMessage(), e);
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_StopWords.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
} catch (IOException e) {
logger .error("ik-analyzer", e);
} finally {
try {
if (is != null) {
is.close();
is = null;
}
} catch (IOException e) {
logger .error("ik-analyzer", e);
}
}
// 加载扩展停止词典
List<String> extStopWordDictFiles = getExtStopWordDictionarys();
if (extStopWordDictFiles != null) {
is = null;
for (String extStopWordDictName : extStopWordDictFiles) {
logger .info("[Dict Loading] " + extStopWordDictName);
// 读取扩展词典文件
file = PathUtils.get (getDictRoot(), extStopWordDictName);
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
logger .error("ik-analyzer", e);
}
// 如果找不到扩展的字典,则忽略
if (is == null) {
continue;
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
// 加载扩展停止词典数据到内存中
_StopWords.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
} catch (IOException e) {
logger .error("ik-analyzer", e);
} finally {
try {
if (is != null) {
is.close();
is = null;
}
} catch (IOException e) {
logger .error("ik-analyzer", e);
}
}
}
}
// 加载远程停用词典
List<String> remoteExtStopWordDictFiles = getRemoteExtStopWordDictionarys();
for (String location : remoteExtStopWordDictFiles) {
logger .info("[Dict Loading] " + location);
List<String> lists = getRemoteWords (location);
// 如果找不到扩展的字典,则忽略
if (lists == null) {
logger .error("[Dict Loading] " + location + "加载失败");
continue;
}
for (String theWord : lists) {
if (theWord != null && !"".equals(theWord.trim())) {
// 加载远程词典数据到主内存中
logger .info(theWord);
_StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
}
}
this.loadMySQLStopwordDict();
}
/**
* 从mysql加载停用词
*/
private void loadMySQLStopwordDict() {
Connection conn = null;
Statement stmt = null;
ResultSet rs = null;
try {
Path file = PathUtils.get (getDictRoot(), "jdbc-reload.properties");
prop .load(new FileInputStream(file.toFile()));
logger .info("[==========]jdbc-reload.properties");
for(Object key : prop .keySet()) {
logger .info("[==========]" + key + "=" + prop .getProperty(String.valueOf (key)));
}
logger .info("[==========]query hot stopword dict from mysql, " + prop .getProperty("jdbc.reload.stopword.sql") + "......");
conn = DriverManager.getConnection (
prop .getProperty("jdbc.url"),
prop .getProperty("jdbc.user"),
prop .getProperty("jdbc.password"));
stmt = conn.createStatement();
rs = stmt.executeQuery(prop .getProperty("jdbc.reload.stopword.sql"));
while(rs.next()) {
String theWord = rs.getString("word");
logger .info("[==========]hot stopword from mysql: " + theWord);
_StopWords.fillSegment(theWord.trim().toCharArray());
}
Thread.sleep (Integer.valueOf (String.valueOf (prop .get("jdbc.reload.interval"))));
} catch (Exception e) {
logger .error("erorr", e);
} finally {
if(rs != null) {
try {
rs.close();
} catch (SQLException e) {
logger .error("error", e);
}
}
if(stmt != null) {
try {
stmt.close();
} catch (SQLException e) {
logger .error("error", e);
}
}
if(conn != null) {
try {
conn.close();
} catch (SQLException e) {
logger .error("error", e);
}
}
}
}
3、mvn package打包代码
target\releases\elasticsearch-analysis-ik-5.2.0.zip
4、解压缩ik压缩包
在elasticsearch-5.2.0\plugins\ik 对压缩文件进行解压缩
将mysql驱动jar,放入ik的目录下
5、修改jdbc相关配置
6、重启es
观察日志,日志中就会显示我们打印的那些东西,比如加载了什么配置,加载了什么词语,什么停用词
7 测试,mysql添加前
测试分词
GET _analyze
{
"text": "一人饮酒",
"analyzer": "ik_max_word"
}
结果:
{
"tokens": [
{
"token": "一人",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 0
},
{
"token": "一",
"start_offset": 0,
"end_offset": 1,
"type": "TYPE_CNUM",
"position": 1
},
{
"token": "人",
"start_offset": 1,
"end_offset": 2,
"type": "COUNT",
"position": 2
},
{
"token": "饮酒",
"start_offset": 2,
"end_offset": 4,
"type": "CN_WORD",
"position": 3
},
{
"token": "饮",
"start_offset": 2,
"end_offset": 3,
"type": "CN_WORD",
"position": 4
},
{
"token": "酒",
"start_offset": 3,
"end_offset": 4,
"type": "CN_WORD",
"position": 5
}
]
}
次数停用词
8在mysql中添加词库与停用词
添加分词
添加停用词
一看日志已经加载出来
9分词实验,验证热更新生效
分词实验
停用词实验