基于规则的自动分词算法
原理
(1) 事先人工建立好分词词典和分词规则库。
(2) 原理为基于字符串匹配进行分词,这样就要求有足够大的词表为依据。
(3) 通过一定的算法来实现,如正向最大匹配法、逆向最大匹配法、双向匹配法等。
(4) 优缺点:当分词词典所收容的词较少时,显然覆盖度就有限,分词的正确率就低。
正向最大匹配法
算法描述
设MaxLen表示最大词长,D为分词词典
(1) 从待切分语料中按正向取长度为MaxLen的字串str,令Len=MaxLen;
(2) 把str与D中的词相匹配;
(3) 若匹配成功,则认为该字串为词,指向待切分语料的指针向前移Len个汉字(字节),返回到(1);
(4) 若不成功:如果Len>2,则将Len减2(此处长度以字节计,每个汉字占2字节;若以字符计则每次减1),从待切分语料中取长度为Len的字串str,返回到(2)。否则,得到一个单字词(2个字节),指向待切分语料的指针向前移1个汉字,返回(1)。
算法代码
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Forward maximum matching (MM) Chinese word segmenter.
 *
 * <p>At each position it tries the longest candidate (up to {@code totalMaxLen}
 * chars) against the dictionary, shrinking the window by one char on every
 * failed lookup; an unmatched single char is emitted as its own token.
 * Tokens in the result are separated by "/ ".
 *
 * @author quincy1994
 */
public class Nlp {

    private String m_sResult = "";  // accumulated segmentation result, tokens separated by "/ "
    private int m_nPosIndex;        // cursor into the text being segmented
    private int m_MaxLen;           // current candidate length; shrinks after failed lookups
    private int totalMaxLen;        // configured maximum word length (in chars)
    private Set<String> dictionary; // segmentation dictionary loaded from dict.txt

    /**
     * Creates a segmenter with the given maximum word length.
     *
     * @param maxLen maximum candidate word length (chars) tried at each position
     */
    public Nlp(int maxLen) {
        this.m_MaxLen = maxLen;
        this.m_nPosIndex = 0;
        this.totalMaxLen = maxLen;
        try {
            this.dictionary = this.loadFile();
        } catch (IOException ex) {
            Logger.getLogger(Nlp.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /** Creates a segmenter with the default maximum word length of 3. */
    public Nlp() {
        this(3); // delegate instead of duplicating the setup code
    }

    /**
     * Loads the dictionary from "dict.txt": one entry per line, the word being
     * the first comma-separated field.
     *
     * @return the set of dictionary words
     * @throws IOException if the file cannot be read
     */
    public Set<String> loadFile() throws FileNotFoundException, IOException {
        Set<String> dict = new HashSet<String>();
        String filename = "dict.txt";
        BufferedReader br = new BufferedReader(new FileReader(filename));
        try {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                String[] token = tmp.split(",");
                dict.add(token[0]);
            }
        } finally {
            br.close(); // the original version leaked the reader
        }
        return dict;
    }

    /**
     * Segments {@code source} by forward maximum matching.
     *
     * <p>Resets all internal state first, so the method can be called
     * repeatedly on the same instance (the original accumulated results
     * across calls).
     *
     * @param source text to segment
     * @return tokens joined with "/ " separators
     */
    public String MMSegment(String source) {
        m_sResult = "";
        m_nPosIndex = 0;
        m_MaxLen = totalMaxLen;
        MM(source, m_MaxLen, m_nPosIndex);
        return m_sResult;
    }

    /**
     * Returns the substring of length {@code len} starting at
     * {@code m_nPosIndex}, clamped to the end of the sentence.
     */
    public String getSubString(String source, int m_nPosIndex, int len) {
        // clamp the end index at the sentence boundary in one step
        int endIndex = Math.min(m_nPosIndex + len, source.length());
        return source.substring(m_nPosIndex, endIndex);
    }

    /**
     * Core matching loop; appends tokens to {@code m_sResult} and advances
     * {@code m_nPosIndex} until the whole text is consumed.
     *
     * <p>Implemented iteratively: the original recursed once per token and
     * could overflow the stack on long input. The public signature is kept.
     */
    public void MM(String source, int len, int frompos) {
        while (m_nPosIndex < source.length()) {
            String sub = getSubString(source, m_nPosIndex, m_MaxLen);
            if (dictionary.contains(sub)) {
                // matched: advance by the *actual* matched length — the
                // candidate may have been clamped near the sentence end
                m_sResult += sub + "/ ";
                m_nPosIndex += sub.length();
                m_MaxLen = totalMaxLen;
            } else if (m_MaxLen > 1) {
                m_MaxLen = m_MaxLen - 1; // shrink the window and retry
            } else {
                // no dictionary word starts here: emit the single char
                m_sResult += sub + "/ ";
                m_nPosIndex += 1;
                m_MaxLen = totalMaxLen;
            }
        }
    }

    /**
     * Demo entry point; requires "dict.txt" in the working directory.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) {
        Nlp nlp = new Nlp();
        String source = "今天天气不错!";
        String result = nlp.MMSegment(source);
        System.out.println(result);
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
逆向最大匹配法
算法描述
与正向最大匹配法原理一样,只是匹配的开始为句尾
代码实现
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Reverse maximum matching (RMM) Chinese word segmenter.
 *
 * <p>Same idea as forward maximum matching, but the window slides from the
 * end of the sentence toward the beginning. Tokens are collected
 * right-to-left as "/word" and reversed into reading order on output.
 *
 * @author quincy1994
 */
public class RMM {

    private String m_sResult = "";  // tokens accumulated right-to-left, each prefixed with "/"
    private int m_nPosIndex;        // cursor: exclusive end of the current window
    private int m_MaxLen;           // current candidate length; shrinks after failed lookups
    private int totalMaxlen;        // configured maximum word length (in chars)
    private Set<String> dictionary; // segmentation dictionary loaded from dict.txt

    /**
     * Creates a segmenter with the given maximum word length.
     *
     * @param maxLen maximum candidate word length (chars) tried at each position
     */
    public RMM(int maxLen) {
        this.m_MaxLen = maxLen;
        this.totalMaxlen = maxLen;
        try {
            this.dictionary = loadFile();
        } catch (IOException ex) {
            Logger.getLogger(RMM.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /** Creates a segmenter with the default maximum word length of 3. */
    public RMM() {
        this(3); // delegate instead of duplicating the setup code
    }

    /**
     * Loads the dictionary from "dict.txt": one entry per line, the word being
     * the first comma-separated field.
     *
     * @return the set of dictionary words
     * @throws IOException if the file cannot be read
     */
    public Set<String> loadFile() throws IOException {
        Set<String> dict = new HashSet<String>();
        String filename = "dict.txt";
        BufferedReader br = new BufferedReader(new FileReader(filename));
        try {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                dict.add(tmp.split(",")[0]);
            }
        } finally {
            br.close(); // the original version leaked the reader
        }
        return dict;
    }

    /**
     * Segments {@code source} by reverse maximum matching.
     *
     * <p>Resets all internal state first, so the method can be called
     * repeatedly on the same instance (the original accumulated results
     * across calls).
     *
     * @param source text to segment
     * @return tokens in reading order, joined with "/ " separators
     */
    public String RMMSegment(String source) {
        m_sResult = "";
        m_MaxLen = totalMaxlen;
        this.m_nPosIndex = source.length();
        rmm(source, m_MaxLen, m_nPosIndex);
        // tokens were appended right-to-left; reverse them for output
        String[] token = m_sResult.split("/");
        StringBuilder result = new StringBuilder();
        for (int i = token.length - 1; i > 0; i--) {
            result.append(token[i]).append("/ ");
        }
        return result.toString();
    }

    /**
     * Returns the substring of length {@code len} ending at
     * {@code m_nPosIndex}, clamped to the start of the sentence.
     */
    public String getSubString(String source, int m_nPosIndex, int len) {
        // clamp the start index at the sentence boundary in one step
        int startIndex = Math.max(m_nPosIndex - len, 0);
        return source.substring(startIndex, m_nPosIndex);
    }

    /**
     * Core matching loop; appends tokens to {@code m_sResult} and retreats
     * {@code m_nPosIndex} until the whole text is consumed.
     *
     * <p>Implemented iteratively and terminating at position 0: the original
     * recursed once per token (stack-overflow risk on long input) and only
     * stopped at a negative cursor, appending a spurious empty token when the
     * cursor landed exactly on 0. The public signature is kept.
     */
    public void rmm(String source, int len, int frompos) {
        while (m_nPosIndex > 0) {
            String sub = getSubString(source, m_nPosIndex, m_MaxLen);
            if (dictionary.contains(sub)) {
                // matched: retreat by the *actual* matched length — the
                // candidate may have been clamped near the sentence start
                m_sResult += "/" + sub;
                m_nPosIndex -= sub.length();
                m_MaxLen = totalMaxlen;
            } else if (m_MaxLen > 1) {
                m_MaxLen = m_MaxLen - 1; // shrink the window and retry
            } else {
                // no dictionary word ends here: emit the single char
                m_sResult += "/" + sub;
                m_nPosIndex -= 1;
                m_MaxLen = totalMaxlen;
            }
        }
    }

    /**
     * Demo entry point; requires "dict.txt" in the working directory.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) {
        RMM myRMM = new RMM();
        String source = "记录最佳前候选词列表";
        String result = myRMM.RMMSegment(source);
        System.out.println(result);
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
基于统计的中文分词算法
基本思想
选择概率最大的分词路径作为最优结果
利用动态规划算法来实现,即最优路径中的第i个词w i 的累计概率等于它的左相邻词w i-1 的累积概率乘以w i 自身的概率
具体算法
(1)对一个待分词的字串S,按照从左到右的顺序取出全部候选词w 1 ,w 2 ,…,w i ,w n ;
(2)计算每个候选词的概率值P(w i ),记录每个候选词的全部左邻词;
(3)计算每个候选词的累计概率,累计概率最大的候选词为最佳左邻词;
如果当前词w n 是字串的尾词,且累计概率P’(w n )最大,则w n 是S的终点词;
(4)从w n 开始,按照从右到左顺序,依次将每个词的最佳左邻词输出,即S的分词结果.
字典树
又称单词查找树,Trie树,是一种树形结构,是一种哈希树的变种。典型应用是用于统计,排序和保存大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,查询效率比哈希树高。
字典树的代码实现
主要参考:http://blog.youkuaiyun.com/sadfishsc/article/details/9152647
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.util.HashMap;
import java.util.Map;
/**
 * A node of the dictionary trie (prefix tree) used for word-frequency lookup.
 *
 * <p>Each node holds a single character; a root-to-node path spells a word
 * prefix. {@code frequency} and {@code antilog} stay at the sentinel value -1
 * unless the path ending at this node is a complete dictionary word. The
 * child map is created lazily on the first {@link #addChild}.
 *
 * @author quincy1994
 */
public class TireNode {

    private String character;               // the single character held by this node
    private int frequency = -1;             // word frequency; -1 marks "prefix only, not a word"
    private double antilog = -1;            // log-scaled frequency; -1 until the word is registered
    private Map<String, TireNode> children; // lazily created child map, keyed by character

    public String getCharacter() {
        return character;
    }

    public void setCharacter(String character) {
        this.character = character;
    }

    public int getFrequency() {
        return frequency;
    }

    public void setFrequency(int frequency) {
        this.frequency = frequency;
    }

    public double getAntilog() {
        return antilog;
    }

    public void setAntilog(double antilog) {
        this.antilog = antilog;
    }

    /**
     * Registers {@code node} as a child keyed by its character.
     * An already-present child with the same character is kept unchanged.
     */
    public void addChild(TireNode node) {
        if (children == null) {
            children = new HashMap<String, TireNode>();
        }
        String key = node.getCharacter();
        if (children.get(key) == null) {
            children.put(key, node);
        }
    }

    /** Returns the child holding {@code ch}, or {@code null} if there is none. */
    public TireNode getChild(String ch) {
        return children == null ? null : children.get(ch);
    }

    /** Removes the child holding {@code ch}; a no-op when it is absent. */
    public void removeChildren(String ch) {
        if (children != null) {
            children.remove(ch);
        }
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
算法实现
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Statistical Chinese word segmentation.
 *
 * <p>Builds a {@link TireNode} trie from the frequency dictionary
 * "wordFre.txt", enumerates every dictionary word occurring in the input
 * (candidate segments), and selects the segmentation whose total cost is
 * minimal via a dynamic-programming pass over the candidates. The per-word
 * cost {@code Math.log(1 + 0.01/percentage)} decreases as the word's
 * frequency grows, so minimizing the summed cost favors frequent words.
 *
 * @author quincy1994
 */
public class ChnSeq {

    private TireNode tire = null; // root of the word-frequency trie; populated by init()

    /**
     * Reads the raw frequency dictionary.
     * Each line of "wordFre.txt" is "word,frequency,percentage%" (comma-separated).
     *
     * @return the file's lines, unparsed
     * @throws IOException if the file cannot be read
     */
    public List<String> loadFile() throws FileNotFoundException, IOException {
        // load the dictionary file line by line
        List<String> lines = new ArrayList<String>();
        String filename = "wordFre.txt";
        BufferedReader br = new BufferedReader(new FileReader(filename));
        String tmp;
        while ((tmp = br.readLine()) != null) {
            lines.add(tmp);
        }
        br.close();
        return lines;
    }

    /**
     * Parses the dictionary and builds the trie.
     * Must be called before {@link #segment} / {@link #preSegment}.
     *
     * @throws IOException if the dictionary file cannot be read
     */
    public void init() throws IOException {
        List<String> lines = loadFile();
        tire = new TireNode();
        for (String line : lines) {
            String[] tokens = line.split(",");
            String word = tokens[0];
            int freq = Integer.parseInt(tokens[1]);
            // cost = log(1 + 0.01/percentage): strictly decreasing in frequency
            double antilog = Math.log(1+0.01/Double.parseDouble(tokens[2].replace("%", ""))) ;
            // build the trie path for this word, creating nodes as needed
            TireNode root = tire;
            for (int i = 0; i < word.length(); i++) {
                String c = "" + word.charAt(i);
                TireNode node = root.getChild(c);
                if (node == null) {
                    node = new TireNode();
                    node.setCharacter(c);
                    root.addChild(node);
                }
                root = node;
            }
            root.setFrequency(freq);   // mark the terminal node as a complete word
            root.setAntilog(antilog);  // store the word's cost at the terminal node
        }
    }

    /** Returns the trie root (null until {@link #init} has run). */
    public TireNode getTire() {
        return tire;
    }

    /**
     * Walks the trie along {@code word}.
     *
     * @return the node reached by the last character, or {@code null} if the
     *         path does not exist; a non-null node with frequency <= 0 means
     *         {@code word} is only a prefix, not a dictionary word
     */
    public TireNode getNodeByWord(String word) {
        TireNode node = tire;
        for (int i = 0; i < word.length(); i++) {
            String ch = word.charAt(i) + "";
            if (node == null) {
                break;
            } else {
                node = node.getChild(ch);
            }
        }
        return node;
    }

    /**
     * One candidate segment of the sentence, plus the artificial START/END
     * terminals bracketing the candidate list.
     *
     * <p>NOTE(review): this class is private yet appears in the signatures of
     * the public methods preSegment/dynamicSegment, so outside callers cannot
     * name it — consider making it a public static nested class.
     */
    private class Segment {
        public String word;     // the candidate word itself
        public String endChar;  // last character of the word
        public String lastChar; // character immediately before the word in the sentence (null for START)
        public double cost;     // word cost from the trie (see init)
        public final static String START_SIGN = "<< STARTING >>";
        public final static String END_SIGN = "<< ENDING >>";
    }

    /**
     * Enumerates every dictionary word occurring in {@code sentence}, in
     * left-to-right order of start position, bracketed by START/END terminals.
     *
     * @param sentence text to segment
     * @return candidate list; index 0 is the START terminal, the last index is END
     */
    public List<Segment> preSegment(String sentence) {
        List<Segment> segs = new ArrayList<Segment>();
        // sentence-start terminal
        Segment terminal = new Segment();
        terminal.word = Segment.START_SIGN;
        terminal.endChar = Segment.START_SIGN;
        terminal.lastChar = null;
        segs.add(terminal);
        for (int i = 0; i < sentence.length(); i++) {
            for (int j = i + 1; j <= sentence.length(); j++) {
                String word = sentence.substring(i, j);
                TireNode tnode = this.getNodeByWord(word);
                if (tnode == null) {
                    // no trie path: no longer word can start at i either
                    break;
                }
                if (tnode.getFrequency() <= 0) {
                    // prefix of a longer word but not a word itself; keep extending
                    continue;
                }
                Segment seg = new Segment();
                seg.word = word;
                seg.endChar = word.substring(word.length() - 1, word.length());
                if (i == 0) {
                    seg.lastChar = Segment.START_SIGN;
                } else {
                    seg.lastChar = sentence.substring(i - 1, i);
                }
                seg.cost = tnode.getAntilog();
                // NOTE(review): debug output left in; remove for production use
                System.out.println(word + " " + seg.cost +" " + tnode.getFrequency());
                segs.add(seg);
            }
        }
        // sentence-end terminal
        terminal = new Segment();
        terminal.word = Segment.END_SIGN;
        terminal.endChar = Segment.END_SIGN;
        terminal.lastChar = sentence.substring(sentence.length() - 1, sentence.length());
        segs.add(terminal);
        return segs;
    }

    /**
     * Picks the minimum-cost segmentation path by dynamic programming over the
     * candidate list produced by {@link #preSegment}.
     *
     * @param segs candidate segments including the START/END terminals
     * @return chosen words joined with "/ ", or {@code null} when there are no candidates
     */
    public String dynamicSegment(List<Segment> segs) {
        // probability-based segmentation via dynamic programming
        final double INFINITE = 9999999; // sentinel: "no transition between these candidates"
        if (segs == null || segs.size() == 0) {
            System.out.println("找不到候选词");
            return null;
        }
        int n = segs.size(); // number of candidates (terminals included)
        // costs[i][j]: cost of choosing candidate j directly after candidate i
        double[][] costs = new double[n][n];
        for (int i = 0; i < n - 1; i++) {
            for (int j = 0; j < n; j++) {
                String endChar = segs.get(i).endChar;
                // diagonal: a candidate whose word equals its own endChar
                // (single-character word) pays its own cost; everything else
                // starts unreachable
                if (j == i && endChar.equals(segs.get(j).word)) {
                    costs[i][j] = segs.get(j).cost; // cost of candidate j
                    continue;
                }
                costs[i][j] = INFINITE;
            }
        }
        // link each candidate to its possible predecessors: j can follow i when
        // j's preceding character equals i's last character, and j is at most 3
        // positions after i in the (start-ordered) candidate list
        for (int i = 0; i < n - 1; i++) {
            String endChar = segs.get(i).endChar;
            for (int j = i + 1; j < n; j++) {
                String lastChar = segs.get(j).lastChar;
                if (lastChar != null && lastChar.equals(endChar) &&( j- i < 4)) { // j's prefix char matches i's end char, within a window of 4 candidates
                    costs[i][j] = segs.get(j).cost; // cost of candidate j
                }
            }
        }
        int sp = 0;     // source: the START terminal
        int fp = n - 1; // sink: the END terminal
        double[] dist = new double[n];                // cumulative cost to reach each candidate
        List<List<Integer>> sPaths = new ArrayList<List<Integer>>(); // best predecessor chain per candidate
        List<Integer> list = new ArrayList<Integer>(); // work list of candidate indices
        for (int i = 0; i < n; i++) {
            dist[i] = costs[sp][i]; // initial cost: direct transition from START to i
            if (sp != i) {
                list.add(i); // queue every candidate except the source
            }
            if (dist[i] < INFINITE) {
                // i is directly reachable from START: open a path for it
                List<Integer> spa = new ArrayList<Integer>();
                sPaths.add(spa);
            } else {
                sPaths.add(null);
            }
        }
        while (!list.isEmpty()) {
            // take the next candidate in list order
            Integer minIdx = list.get(0);
            // NOTE: minIdx is an Integer, so this removes the *element*, not
            // the position at that index
            list.remove(minIdx);
            // skip candidates that are still unreachable
            if(dist[minIdx] == INFINITE){
                continue;
            }
            // relax all transitions out of minIdx
            for (int i = minIdx+1; i < n; i++) {
                if (dist[i] > dist[minIdx] + costs[minIdx][i]) {
                    dist[i] = dist[minIdx] + costs[minIdx][i];
                    List<Integer> tmp = new ArrayList<Integer>(sPaths.get(minIdx));
                    tmp.add(minIdx);
                    sPaths.set(i, tmp); // record the best predecessor chain for i
                }
            }
        }
        // the best chain into the END terminal lists the chosen word indices
        // in reading order (START and END themselves are excluded)
        String result = "";
        for (int i = 0; i < sPaths.get(fp).size(); i++) {
            result += segs.get(sPaths.get(fp).get(i)).word + "/ ";
        }
        return result;
    }

    /**
     * Convenience wrapper: candidate generation followed by the DP selection.
     *
     * @param sentences text to segment
     * @return chosen words joined with "/ "
     */
    public String segment(String sentences) {
        return dynamicSegment(preSegment(sentences));
    }

    /**
     * Demo entry point; requires "wordFre.txt" in the working directory.
     */
    public static void main(String[] args) throws ClassNotFoundException, IOException {
        ChnSeq cs = new ChnSeq();
        cs.init();
        String sentence = "在这一年中,改革开放和现代化建设继续向前迈进。经济保持了“高增长、低通胀”的良好发展态势。农业生产再次获得好的收成,企业改革继续深化,人民生活进一步改善。对外经济技术合作与交流不断扩大。";
        String segs = cs.segment(sentence);
        System.out.println(segs);
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
- 148
- 149
- 150
- 151
- 152
- 153
- 154
- 155
- 156
- 157
- 158
- 159
- 160
- 161
- 162
- 163
- 164
- 165
- 166
- 167
- 168
- 169
- 170
- 171
- 172
- 173
- 174
- 175
- 176
- 177
- 178
- 179
- 180
- 181
- 182
- 183
- 184
- 185
- 186
- 187
- 188
- 189
- 190
- 191
- 192
- 193
- 194
- 195
- 196
- 197
- 198
- 199
- 200
- 201
- 202
- 203
- 204
- 205
- 206
- 207
- 208
- 209
- 210
- 211
- 212
- 213
- 214
- 215
- 216
- 217
- 218
- 219
- 220
- 221
- 222
- 223
- 224
- 225
- 226
- 227
- 228
- 229
- 230
- 231
具体的代码和字典,可以访问:
https://github.com/Quincy1994/Segment