Python工程师Java之路（n）手写中文分词

最新推荐文章于 2025-01-24 12:27:28 发布

小基基o_O

最新推荐文章于 2025-01-24 12:27:28 发布

阅读量326

点赞数 2

CC 4.0 BY-SA版权

分类专栏： Java

本文链接：https://blog.youkuaiyun.com/Yellow_python/article/details/115035978

Java 专栏收录该内容

32 篇文章

订阅专栏

本文介绍了一种基于Java的手写中文分词算法实现，包括使用贝叶斯网络结合动态规划的方法以及HMM（隐马尔可夫模型）进行分词的过程。提供了完整的代码示例及测试案例。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

手写中文分词极简代码

徒手编写Java中文分词【贝叶斯网络+动态规划】
点击此处可查看中文分词算法原理
用法：传入自定义词典（格式为 HashMap<String, Integer>，键为词语、值为词频）创建 Tokenizer 对象，然后调用 cut 方法即可得到分词结果

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Dictionary-based word segmenter.
 *
 * <p>For every start position the segmenter collects candidate words (single
 * characters, dictionary entries, and runs matching the English/number
 * pattern), then walks the clause from its end to its start and keeps, for
 * each position, the candidate whose accumulated score is highest. A
 * candidate's score is {@code log10(frequency) - log10(totalFrequency)};
 * words absent from the dictionary are scored with frequency 1.
 */
public class Tokenizer {
    /** Sentinel score used before any candidate has been evaluated. */
    private static final double MIN_SCORE = -9e99;

    /**
     * English words, integers and decimals with an optional trailing percent sign.
     * Accepts the same strings as {@code [a-zA-Z]+}, {@code [0-9]+%?} and
     * {@code [0-9]+[.][0-9]+%?} combined, written so that a greedy prefix match
     * yields the longest token.
     */
    private static final Pattern TOKEN_PATTERN =
            Pattern.compile("[a-zA-Z]+|[0-9]+(?:[.][0-9]+)?%?");

    /** word -> frequency, as supplied by the caller (not copied). */
    private final HashMap<String, Integer> w2f;
    /** Length of the longest dictionary word (at least 1). */
    private final int maxLen;
    /** log10 of the summed dictionary frequencies. */
    private final double logTotal;

    /** Mutable (score, end index) pair stored per start position in the route table. */
    static class Pair {
        double _1;
        int _2;

        Pair(double a, int b) {
            _1 = a;
            _2 = b;
        }

        @Override
        public String toString() {
            return "(" + _1 + "," + _2 + ")";
        }
    }

    /**
     * Creates a segmenter backed by the given dictionary.
     *
     * @param w2f word-to-frequency dictionary; entries with a non-positive
     *            frequency contribute nothing to the total and are scored
     *            like unknown words
     */
    public Tokenizer(HashMap<String, Integer> w2f) {
        this.w2f = w2f;
        int longest = 1;
        long total = 0L; // long: the sum of many int frequencies can exceed int range
        for (Map.Entry<String, Integer> kv : w2f.entrySet()) {
            longest = Math.max(longest, kv.getKey().length());
            total += Math.max(kv.getValue(), 0);
        }
        maxLen = longest;
        // Floor at 1 so an empty (or all-zero) dictionary yields 0 rather than -Infinity.
        logTotal = Math.log10(Math.max(total, 1L));
    }

    /**
     * Computes the best-scoring segmentation route for a clause.
     *
     * @param clause text to segment
     * @return map from start index to a pair of (best score from that index to
     *         the end, inclusive end index of the word chosen at that index);
     *         the entry for {@code clause.length()} is the terminator (0.0, 0)
     */
    public HashMap<Integer, Pair> calculate(String clause) {
        int len = clause.length();
        List<List<Integer>> dag = buildDag(clause);
        HashMap<Integer, Pair> route = new HashMap<>();
        route.put(len, new Pair(0.0, 0));
        // Right-to-left so route[end + 1] is always available.
        for (int i = len - 1; i >= 0; i--) {
            // Default to the single character at i: the end index must never
            // point before i, otherwise cut() cannot make progress.
            Pair best = new Pair(MIN_SCORE, i);
            for (int end : dag.get(i)) {
                // Floor at 1 so log10 never sees zero or a negative number.
                int freq = Math.max(w2f.getOrDefault(clause.substring(i, end + 1), 1), 1);
                double score = Math.log10(freq) - logTotal + route.get(end + 1)._1;
                if (score > best._1) {
                    best._1 = score;
                    best._2 = end;
                }
            }
            route.put(i, best);
        }
        return route;
    }

    /**
     * Segments a clause into words.
     *
     * @param clause text to segment
     * @return the words in order; empty for an empty clause
     */
    public ArrayList<String> cut(String clause) {
        HashMap<Integer, Pair> route = calculate(clause);
        int len = clause.length();
        ArrayList<String> words = new ArrayList<>();
        int x = 0;
        while (x < len) {
            int y = route.get(x)._2 + 1;
            words.add(clause.substring(x, y));
            x = y;
        }
        return words;
    }

    /**
     * Builds the word graph: for each start index, the inclusive end indices of
     * every candidate word, in increasing order.
     */
    private List<List<Integer>> buildDag(String clause) {
        int len = clause.length();
        List<List<Integer>> dag = new ArrayList<>(len);
        Matcher token = TOKEN_PATTERN.matcher(clause);
        for (int head = 0; head < len; head++) {
            List<Integer> ends = new ArrayList<>();
            ends.add(head); // a single character is always a candidate
            int window = Math.min(len, head + maxLen);
            for (int mid = head + 2; mid <= window; mid++) {
                if (w2f.containsKey(clause.substring(head, mid))
                        || token.region(head, mid).matches()) {
                    ends.add(mid - 1);
                }
            }
            // English words and numbers are not bounded by the dictionary's
            // longest entry: add the longest such token when it reaches past
            // the dictionary window.
            if (token.region(head, len).lookingAt() && token.end() > window) {
                ends.add(token.end() - 1);
            }
            dag.add(ends);
        }
        return dag;
    }
}

测试：

// Demo dictionary: every entry is registered with frequency 2.
HashMap<String, Integer> dict = new HashMap<>();
for (String entry : new String[]{"空调", "调和", "和风", "风扇", "和"}) {
    dict.put(entry, 2);
}
Tokenizer segmenter = new Tokenizer(dict);
ArrayList<String> tokens = segmenter.cut("空调和风扇99元");
System.out.println(tokens);
// Prints: [空调, 和, 风扇, 99, 元]

HMM分词（不带词典；发射概率表 emitP 初始为空，使用前需自行填充，见下方测试）

import java.util.ArrayList;
import java.util.HashMap;

/**
 * Dictionary-free segmenter that tags every character as B (begin), M (middle),
 * E (end) or S (single) with the Viterbi algorithm and cuts words at the tags.
 *
 * <p>All scores are added together, so the tables are presumably
 * log-probabilities. The emission table {@link #emitP} starts empty and must be
 * filled by the caller before segmenting.
 */
public class HmmTokenizer {
    /** Score used for a missing start, transition or emission entry. */
    private static final double minDouble = -9e90;
    /** Sentinel below which no candidate path is accepted. */
    private static final double minDouble99 = -9e99;
    private static final char[] states = new char[]{'B', 'M', 'E', 'S'};
    /** For each tag, the tags allowed to precede it, in evaluation order. */
    private static final HashMap<Character, char[]> prevStatus = new HashMap<>();
    private static final HashMap<Character, Double> startP = new HashMap<>();
    private static final HashMap<Character, HashMap<Character, Double>> transP = new HashMap<>();
    /** tag -> (character -> emission score); populated by the caller. */
    static HashMap<Character, HashMap<Character, Double>> emitP = new HashMap<>();

    static {
        // Allowed predecessor tags.
        prevStatus.put('B', new char[]{'E', 'S'});
        prevStatus.put('M', new char[]{'M', 'B'});
        prevStatus.put('S', new char[]{'S', 'E'});
        prevStatus.put('E', new char[]{'B', 'M'});
        // Start scores: a clause effectively starts with B or S.
        startP.put('B', -0.26268660809250016);
        startP.put('E', minDouble);
        startP.put('M', minDouble);
        startP.put('S', -1.4652633398537678);
        // Transition scores.
        transP.put('B', new HashMap<>());
        transP.put('E', new HashMap<>());
        transP.put('M', new HashMap<>());
        transP.put('S', new HashMap<>());
        transP.get('B').put('E', -0.51082562376599);
        transP.get('B').put('M', -0.916290731874155);
        transP.get('E').put('B', -0.5897149736854513);
        transP.get('E').put('S', -0.8085250474669937);
        transP.get('M').put('E', -0.33344856811948514);
        transP.get('M').put('M', -1.2603623820268226);
        transP.get('S').put('B', -0.7211965654669841);
        transP.get('S').put('S', -0.6658631448798212);
        // Emission scores: one empty table per tag.
        emitP.put('B', new HashMap<>());
        emitP.put('E', new HashMap<>());
        emitP.put('M', new HashMap<>());
        emitP.put('S', new HashMap<>());
    }

    /**
     * Finds the best-scoring tag sequence for the given characters.
     *
     * @param obs characters to tag
     * @return one tag per character; the last tag is always E or S; empty for
     *         empty input
     */
    public ArrayList<Character> viterbi(String obs) {
        ArrayList<Character> tags = new ArrayList<>();
        int len = obs.length();
        if (len == 0) {
            return tags; // nothing to tag; avoids charAt(0) on an empty string
        }
        // Best score of any path ending in each tag at the previous position.
        HashMap<Character, Double> prev = new HashMap<>();
        for (char y : states) {
            prev.put(y, startP.get(y) + emitP.get(y).getOrDefault(obs.charAt(0), minDouble));
        }
        // back.get(t - 1) maps a tag at position t to its best predecessor tag.
        // Storing back-pointers avoids copying whole paths at every step.
        ArrayList<HashMap<Character, Character>> back = new ArrayList<>();
        for (int t = 1; t < len; t++) {
            HashMap<Character, Double> cur = new HashMap<>();
            HashMap<Character, Character> from = new HashMap<>();
            for (char y : states) {
                double emP = emitP.get(y).getOrDefault(obs.charAt(t), minDouble);
                char[] candidates = prevStatus.get(y);
                double maxProb = minDouble99;
                // Fall back to the first allowed predecessor if nothing beats the sentinel.
                char best = candidates[0];
                for (char y0 : candidates) {
                    double prob = prev.get(y0) + transP.get(y0).getOrDefault(y, minDouble) + emP;
                    if (prob > maxProb) {
                        maxProb = prob;
                        best = y0;
                    }
                }
                cur.put(y, maxProb);
                from.put(y, best);
            }
            back.add(from);
            prev = cur;
        }
        // A clause can only end on E or S; S wins ties.
        char state = prev.get('E') > prev.get('S') ? 'E' : 'S';
        char[] path = new char[len];
        path[len - 1] = state;
        for (int t = len - 1; t > 0; t--) {
            state = back.get(t - 1).get(state);
            path[t - 1] = state;
        }
        for (char tag : path) {
            tags.add(tag);
        }
        return tags;
    }

    /**
     * Segments a clause using only the HMM tags.
     *
     * @param clause text to segment
     * @return the words in order; empty for an empty clause
     */
    public ArrayList<String> cut_without_dict(String clause) {
        ArrayList<Character> posList = viterbi(clause);
        int begin = 0;
        int nextI = 0;
        ArrayList<String> words = new ArrayList<>();
        int len = clause.length();
        for (int i = 0; i < len; i++) {
            char pos = posList.get(i);
            if (pos == 'B') {
                begin = i;
            } else if (pos == 'E') {
                words.add(clause.substring(begin, i + 1));
                nextI = i + 1;
            } else if (pos == 'S') {
                words.add(clause.substring(i, i + 1));
                nextI = i + 1;
            }
        }
        // Emit whatever follows the last completed word, if anything.
        if (nextI < len) {
            words.add(clause.substring(nextI));
        }
        return words;
    }
}

测试：

// Demo emission scores: one row per character, columns in tag order B, M, E, S.
char[] tagOrder = {'B', 'M', 'E', 'S'};
char[] demoChars = {'南', '门', '大', '的'};
double[][] demoScores = {
    {-5.5, -9.9, -9.9, -9.9}, // 南
    {-9.9, -9.9, -4.4, -9.9}, // 门
    {-9.9, -8.8, -3.3, -9.9}, // 大
    {-9.9, -9.9, -9.9, -1.1}  // 的
};
for (int c = 0; c < demoChars.length; c++) {
    for (int s = 0; s < tagOrder.length; s++) {
        HmmTokenizer.emitP.get(tagOrder[s]).put(demoChars[c], demoScores[c][s]);
    }
}
HmmTokenizer hmm = new HmmTokenizer();
System.out.println(hmm.viterbi("南大的南大门"));          // [B, E, S, B, M, E]
System.out.println(hmm.cut_without_dict("南大的南大门")); // [南大, 的, 南大门]