手写中文分词极简代码
- 徒手编写Java中文分词【贝叶斯网络+动态规划】
- 点击此处可查看中文分词算法原理
- 用法:传入自定义词典(格式为 `HashMap<String, Integer>`,即"词 -> 词频")创建 `Tokenizer` 对象,然后调用 `cut` 方法即可
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
/**
 * Dictionary-based word segmenter.
 *
 * <p>A word graph is built over the input: every single character is a candidate word, and
 * every longer substring (up to the longest dictionary word) is a candidate when it is in the
 * dictionary, consists only of ASCII letters, or looks like a number. Dynamic programming then
 * selects the segmentation with the highest sum of {@code log10(frequency / total)}.
 *
 * <p>The dictionary passed to the constructor is kept by reference, not copied; the maximum word
 * length and the total frequency are computed once at construction time.
 */
public class Tokenizer {
    /** Sentinel score, lower than any score a real candidate is expected to reach. */
    private static final double minDouble = -9e99;
    /** Runs of ASCII letters. Compiled once instead of on every candidate substring. */
    private static final java.util.regex.Pattern EN_PATTERN =
            java.util.regex.Pattern.compile("[a-zA-Z]+");
    /** Integers or decimals, optionally followed by a percent sign. */
    private static final java.util.regex.Pattern NUM_PATTERN =
            java.util.regex.Pattern.compile("[0-9]+%?|[0-9]+[.][0-9]+%?");

    /** word -> frequency */
    private final HashMap<String, Integer> w2f;
    /** Length of the longest dictionary word (at least 1); bounds the scan window. */
    private final int maxLen;
    /** log10 of the summed positive frequencies (0 when that sum is below 1). */
    private final double logTotal;

    /** Mutable (score, end index) pair stored per start position by {@link #calculate}. */
    static class Pair {
        /** Best log10 score of the suffix starting at this position. */
        double _1;
        /** Inclusive end index of the first word of that best suffix segmentation. */
        int _2;

        Pair(double a, int b) {
            _1 = a;
            _2 = b;
        }

        @Override
        public String toString() {
            return "(" + _1 + "," + _2 + ")";
        }
    }

    /**
     * Creates a tokenizer backed by the given dictionary.
     *
     * @param w2f word -> frequency; entries with a null key are ignored, and null or
     *            non-positive frequencies do not contribute to the total
     * @throws NullPointerException if {@code w2f} is null
     */
    public Tokenizer(HashMap<String, Integer> w2f) {
        if (w2f == null) {
            throw new NullPointerException("w2f");
        }
        this.w2f = w2f;
        int longest = 1;
        // long accumulator: summing many int frequencies must not wrap around.
        long total = 0L;
        for (Map.Entry<String, Integer> kv : w2f.entrySet()) {
            String word = kv.getKey();
            if (word != null && word.length() > longest) {
                longest = word.length();
            }
            Integer freq = kv.getValue();
            if (freq != null && freq > 0) {
                total += freq;
            }
        }
        this.maxLen = longest;
        // Clamp to 1 so an empty or all-zero dictionary yields 0.0 instead of -Infinity/NaN.
        this.logTotal = Math.log10(Math.max(total, 1L));
    }

    /**
     * Computes the best segmentation route for a clause.
     *
     * @param clause text to segment
     * @return map from start index {@code i} to a pair holding the best score of
     *         {@code clause.substring(i)} and the inclusive end index of its first word; the
     *         entry at {@code clause.length()} is the terminal {@code (0.0, 0)}
     */
    public HashMap<Integer, Pair> calculate(String clause) {
        int len = clause.length();

        // Word graph: dag.get(head) holds the inclusive end index of every candidate word
        // starting at head. The single character itself is always a candidate.
        // NOTE: the window is capped at maxLen, so letter/number runs longer than the longest
        // dictionary word are never offered as one token.
        ArrayList<ArrayList<Integer>> dag = new ArrayList<>(len);
        for (int head = 0; head < len; head++) {
            ArrayList<Integer> ends = new ArrayList<>();
            ends.add(head);
            int tail = Math.min(len, head + maxLen);
            for (int mid = head + 2; mid <= tail; mid++) {
                String word = clause.substring(head, mid);
                if (w2f.containsKey(word)                        // dictionary match
                        || EN_PATTERN.matcher(word).matches()    // English match
                        || NUM_PATTERN.matcher(word).matches()) { // number match
                    ends.add(mid - 1);
                }
            }
            dag.add(ends);
        }

        // Dynamic programming from right to left over the word graph.
        HashMap<Integer, Pair> route = new HashMap<>();
        route.put(len, new Pair(0.0, 0));
        for (int i = len - 1; i >= 0; i--) {
            // Seed with the single-character word ending at i, so that cut() always advances
            // even if no candidate manages to beat the sentinel score.
            Pair best = new Pair(minDouble, i);
            for (int end : dag.get(i)) {
                Integer freq = w2f.get(clause.substring(i, end + 1));
                // Unknown words and null/non-positive frequencies count as 1 (log10 = 0),
                // which keeps -Infinity and NaN out of the scores.
                int safeFreq = (freq == null || freq < 1) ? 1 : freq;
                double score = Math.log10(safeFreq) - logTotal + route.get(end + 1)._1;
                if (score > best._1) {
                    best._1 = score;
                    best._2 = end;
                }
            }
            route.put(i, best);
        }
        return route;
    }

    /**
     * Segments a clause into words.
     *
     * @param clause text to segment
     * @return the words in order; concatenating them reproduces {@code clause}. Empty for an
     *         empty clause.
     */
    public ArrayList<String> cut(String clause) {
        HashMap<Integer, Pair> route = calculate(clause);
        int len = clause.length();
        ArrayList<String> words = new ArrayList<>();
        // Follow the route: each entry gives the inclusive end of the word starting at x.
        int x = 0;
        while (x < len) {
            int y = route.get(x)._2 + 1;
            words.add(clause.substring(x, y));
            x = y;
        }
        return words;
    }
}
测试:
// Build a tiny dictionary in which every word has frequency 2.
HashMap<String, Integer> w2f = new HashMap<>();
for (String word : new String[]{"空调", "调和", "和风", "风扇", "和"}) {
    w2f.put(word, 2);
}
Tokenizer tk = new Tokenizer(w2f);
String sample = "空调和风扇99元";
System.out.println(tk.cut(sample));
// Printed result: [空调, 和, 风扇, 99, 元]
HMM分词(不带词典)
import java.util.ArrayList;
import java.util.HashMap;
/**
 * Dictionary-free word segmenter based on a hidden Markov model over the four position tags
 * B (begin), M (middle), E (end) and S (single-character word), decoded with Viterbi.
 *
 * <p>All scores are log-probabilities. Start and transition tables are fixed in the static
 * initializer; the emission table {@link #emitP} starts empty and must be filled by the caller.
 */
public class HmmTokenizer {
    /** Log-score used for a missing start/transition/emission entry. */
    private static final double minDouble = -9e90;
    /** Initial "worse than anything" value when searching for the best predecessor. */
    private static final double minDouble99 = -9e99;
    private static final char[] states = new char[]{'B', 'M', 'E', 'S'};
    /** tag -> tags allowed to precede it */
    private static final HashMap<Character, char[]> prevStatus = new HashMap<>();
    private static final HashMap<Character, Double> startP = new HashMap<>();
    private static final HashMap<Character, HashMap<Character, Double>> transP = new HashMap<>();
    /** tag -> (character -> emission log-probability); populated by callers. */
    static HashMap<Character, HashMap<Character, Double>> emitP = new HashMap<>();

    static {
        // Allowed predecessor tags.
        prevStatus.put('B', new char[]{'E', 'S'});
        prevStatus.put('M', new char[]{'M', 'B'});
        prevStatus.put('S', new char[]{'S', 'E'});
        prevStatus.put('E', new char[]{'B', 'M'});
        // Start log-probabilities.
        startP.put('B', -0.26268660809250016);
        startP.put('E', minDouble);
        startP.put('M', minDouble);
        startP.put('S', -1.4652633398537678);
        // Transition log-probabilities.
        transP.put('B', new HashMap<>());
        transP.put('E', new HashMap<>());
        transP.put('M', new HashMap<>());
        transP.put('S', new HashMap<>());
        transP.get('B').put('E', -0.51082562376599);
        transP.get('B').put('M', -0.916290731874155);
        transP.get('E').put('B', -0.5897149736854513);
        transP.get('E').put('S', -0.8085250474669937);
        transP.get('M').put('E', -0.33344856811948514);
        transP.get('M').put('M', -1.2603623820268226);
        transP.get('S').put('B', -0.7211965654669841);
        transP.get('S').put('S', -0.6658631448798212);
        // Emission tables, one (initially empty) map per tag.
        emitP.put('B', new HashMap<>());
        emitP.put('E', new HashMap<>());
        emitP.put('M', new HashMap<>());
        emitP.put('S', new HashMap<>());
    }

    /** Emission log-score of {@code ch} under {@code state}; {@code minDouble} when absent. */
    private static double emission(char state, char ch) {
        HashMap<Character, Double> table = emitP.get(state);
        Double p = (table == null) ? null : table.get(ch);
        return (p == null) ? minDouble : p;
    }

    /**
     * Decodes the most likely tag sequence for the given text.
     *
     * @param obs observed characters
     * @return one tag ('B', 'M', 'E' or 'S') per character of {@code obs}; the last tag is
     *         always 'E' or 'S'. Empty for empty input.
     */
    public ArrayList<Character> viterbi(String obs) {
        int len = obs.length();
        ArrayList<Character> tags = new ArrayList<>(len);
        if (len == 0) {
            // Nothing to decode; previously this failed on obs.charAt(0).
            return tags;
        }

        // Scores of the best path ending in each state at the current position.
        HashMap<Character, Double> prev = new HashMap<>();
        for (char y : states) {
            prev.put(y, startP.get(y) + emission(y, obs.charAt(0)));
        }

        // back.get(t - 1).get(y) is the best predecessor of state y at position t.
        // Back-pointers replace copying a whole path per state per step.
        ArrayList<HashMap<Character, Character>> back = new ArrayList<>(len - 1);
        for (int t = 1; t < len; t++) {
            char ch = obs.charAt(t);
            HashMap<Character, Double> cur = new HashMap<>();
            HashMap<Character, Character> pointers = new HashMap<>();
            for (char y : states) {
                double emP = emission(y, ch);
                char[] candidates = prevStatus.get(y);
                // Default to the first allowed predecessor so a valid tag is always recorded,
                // even when every candidate score is -Infinity or NaN.
                char bestPrev = candidates[0];
                double maxProb = minDouble99;
                for (char y0 : candidates) {
                    double prob = prev.get(y0) + transP.get(y0).getOrDefault(y, minDouble) + emP;
                    if (prob > maxProb) {
                        maxProb = prob;
                        bestPrev = y0;
                    }
                }
                cur.put(y, maxProb);
                pointers.put(y, bestPrev);
            }
            back.add(pointers);
            prev = cur;
        }

        // A segmentation can only finish on E or S; ties go to S.
        char state = (prev.get('E') > prev.get('S')) ? 'E' : 'S';

        // Walk the back-pointers from the last position to the first.
        char[] decoded = new char[len];
        decoded[len - 1] = state;
        for (int t = len - 1; t > 0; t--) {
            state = back.get(t - 1).get(state);
            decoded[t - 1] = state;
        }
        for (char c : decoded) {
            tags.add(c);
        }
        return tags;
    }

    /**
     * Segments a clause using only the HMM (no dictionary).
     *
     * @param clause text to segment
     * @return the words in order; empty for an empty clause
     */
    public ArrayList<String> cut_without_dict(String clause) {
        ArrayList<String> words = new ArrayList<>();
        int len = clause.length();
        if (len == 0) {
            return words;
        }
        ArrayList<Character> posList = viterbi(clause);
        int begin = 0;
        int nextI = 0;
        for (int i = 0; i < len; i++) {
            char pos = posList.get(i);
            if (pos == 'B') {
                begin = i;
            } else if (pos == 'E') {
                words.add(clause.substring(begin, i + 1));
                nextI = i + 1;
            } else if (pos == 'S') {
                words.add(clause.substring(i, i + 1));
                nextI = i + 1;
            }
            // 'M' extends the current word; nothing to emit yet.
        }
        // Defensive only: viterbi() always ends on E or S, so no tail should remain.
        if (nextI < len) {
            words.add(clause.substring(nextI));
        }
        return words;
    }
}
测试:
// Toy emission log-probabilities: one row per character, columns ordered B, M, E, S.
char[] tags = {'B', 'M', 'E', 'S'};
char[] symbols = {'南', '门', '大', '的'};
double[][] logProb = {
    {-5.5, -9.9, -9.9, -9.9}, // 南
    {-9.9, -9.9, -4.4, -9.9}, // 门
    {-9.9, -8.8, -3.3, -9.9}, // 大
    {-9.9, -9.9, -9.9, -1.1}, // 的
};
for (int row = 0; row < symbols.length; row++) {
    for (int col = 0; col < tags.length; col++) {
        HmmTokenizer.emitP.get(tags[col]).put(symbols[row], logProb[row][col]);
    }
}
HmmTokenizer tk = new HmmTokenizer();
String sample = "南大的南大门";
System.out.println(tk.viterbi(sample)); // [B, E, S, B, M, E]
System.out.println(tk.cut_without_dict(sample)); // [南大, 的, 南大门]
代码下载
https://download.youkuaiyun.com/download/Yellow_python/16498500