Python工程师Java之路(n)手写中文分词

本文介绍了一种基于Java的手写中文分词算法实现,包括使用贝叶斯网络结合动态规划的方法以及HMM(隐马尔可夫模型)进行分词的过程。提供了完整的代码示例及测试案例。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

手写中文分词极简代码

  • 徒手编写Java中文分词【贝叶斯网络+动态规划】
  • 点击此处可查看中文分词算法原理
  • 用法:传入自定义词典(格式为 HashMap<String, Integer>,键为词语、值为词频)创建 Tokenizer 对象,然后调用 cut 方法即可
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Dictionary-based word segmenter.
 *
 * <p>For a clause it builds a graph of candidate words (dictionary entries plus
 * runs matching the English-letter and number patterns), scores every candidate
 * as {@code log10(frequency) - log10(totalFrequency)}, and uses right-to-left
 * dynamic programming to pick the segmentation with the highest summed score.
 *
 * <p>Candidate words are at most as long as the longest dictionary key.
 */
public class Tokenizer {
    /** Sentinel lower than any real score; seeds the per-position maximum. */
    private static final double minDouble = -9e99;
    private static final String reEn = "[a-zA-Z]+";
    private static final String reNum = "[0-9]+%?|[0-9]+[.][0-9]+%?";
    // Compiled once: String.matches would recompile the regex for every candidate substring.
    private static final Pattern EN_PATTERN = Pattern.compile(reEn);
    private static final Pattern NUM_PATTERN = Pattern.compile(reNum);

    private final HashMap<String, Integer> w2f; // word -> frequency
    private final int maxLen;                   // length of the longest dictionary key (at least 1)
    private final double logTotal;              // log10 of the summed frequencies

    /** Mutable (score, end index) pair stored for every start position of the route. */
    class Pair {
        double _1; // best summed log score from this position to the end of the clause
        int _2;    // inclusive end index of the word chosen at this position

        Pair(double a, int b) {
            _1 = a;
            _2 = b;
        }

        @Override
        public String toString() {
            return "(" + _1 + "," + _2 + ")";
        }
    }

    /**
     * Creates a tokenizer backed by the given dictionary.
     *
     * @param w2f dictionary mapping each word to its frequency; the map is kept by
     *            reference, not copied
     */
    public Tokenizer(HashMap<String, Integer> w2f) {
        this.w2f = w2f;
        int longest = 1;
        // Accumulate in a long: an int sum overflows for large dictionaries and would
        // turn logTotal into NaN.
        long total = 0L;
        for (Map.Entry<String, Integer> kv : w2f.entrySet()) {
            longest = Math.max(longest, kv.getKey().length());
            total += kv.getValue();
        }
        this.maxLen = longest;
        this.logTotal = Math.log10(total);
    }

    /**
     * Computes the best-scoring route through the clause.
     *
     * @param clause text to segment
     * @return map from start index to a {@link Pair} holding the best score from that
     *         index to the end and the inclusive end index of the word starting there;
     *         the entry for {@code clause.length()} is the terminal {@code (0.0, 0)}
     */
    public HashMap<Integer, Pair> calculate(String clause) {
        int len = clause.length();

        // Word graph: for every start index, the inclusive end indexes of candidate words.
        HashMap<Integer, ArrayList<Integer>> dag = new HashMap<>();
        for (int head = 0; head < len; head++) {
            int tail = Math.min(len, head + maxLen);
            ArrayList<Integer> ends = new ArrayList<>();
            ends.add(head); // a single character is always a candidate
            for (int mid = head + 2; mid <= tail; mid++) {
                String word = clause.substring(head, mid);
                boolean candidate = w2f.containsKey(word)
                        || EN_PATTERN.matcher(word).matches()
                        || NUM_PATTERN.matcher(word).matches();
                if (candidate) {
                    ends.add(mid - 1);
                }
            }
            dag.put(head, ends);
        }

        // Dynamic programming from the end of the clause back to the start.
        HashMap<Integer, Pair> route = new HashMap<>();
        route.put(len, new Pair(0.0, 0));
        for (int i = len - 1; i >= 0; i--) {
            // Default to the single character at i so that cut() always advances,
            // even when no candidate produces a usable score.
            Pair best = new Pair(minDouble, i);
            boolean scored = false;
            for (int end : dag.get(i)) {
                double logFreq = Math.log10(w2f.getOrDefault(clause.substring(i, end + 1), 1));
                double status = logFreq - logTotal + route.get(end + 1)._1;
                if (status > best._1) {
                    best._1 = status;
                    best._2 = end;
                    scored = true;
                }
            }
            if (!scored) {
                // Every candidate scored -Infinity or NaN (e.g. a dictionary frequency of 0).
                // Score the single character like an unknown word (frequency 1) so that
                // earlier positions can still be compared meaningfully.
                best._1 = -logTotal + route.get(i + 1)._1;
            }
            route.put(i, best);
        }
        return route;
    }

    /**
     * Segments the clause into words.
     *
     * @param clause text to segment
     * @return the words in order; concatenating them yields {@code clause}
     */
    public ArrayList<String> cut(String clause) {
        HashMap<Integer, Pair> route = calculate(clause);
        int len = clause.length();
        ArrayList<String> words = new ArrayList<>();
        // Follow the chosen end index from each start position.
        int x = 0;
        while (x < len) {
            int y = route.get(x)._2 + 1;
            words.add(clause.substring(x, y));
            x = y;
        }
        return words;
    }
}

测试:

// Build a small dictionary in which every word has the same frequency.
HashMap<String, Integer> w2f = new HashMap<>();
for (String word : new String[]{"空调", "调和", "和风", "风扇", "和"}) {
    w2f.put(word, 2);
}
Tokenizer tk = new Tokenizer(w2f);
ArrayList<String> segmented = tk.cut("空调和风扇99元");
System.out.println(segmented);
// Prints: [空调, 和, 风扇, 99, 元]

HMM分词(不带词典)

import java.util.ArrayList;
import java.util.HashMap;

/**
 * Dictionary-free word segmenter based on a hidden Markov model over the four
 * position tags B (begin), M (middle), E (end) and S (single-character word).
 *
 * <p>All probabilities are log values. Start and transition tables are built in;
 * the emission table {@link #emitP} starts empty for each tag and must be filled
 * by the caller. A character missing from a tag's emission table is scored with
 * {@code minDouble}.
 */
public class HmmTokenizer {
    /** Log score used for missing start, transition, or emission entries. */
    private static final double minDouble = -9e90;
    /** Sentinel below any reachable score; seeds the per-state maximum. */
    private static final double minDouble99 = -9e99;
    private static final char[] states = new char[]{'B', 'M', 'E', 'S'};
    /** For each tag, the tags allowed to precede it, in the order they are tried. */
    private static final HashMap<Character, char[]> prevStatus = new HashMap<>();
    private static final HashMap<Character, Double> startP = new HashMap<>();
    private static final HashMap<Character, HashMap<Character, Double>> transP = new HashMap<>();
    /** Emission log probabilities: tag -> character -> log probability. Filled by callers. */
    static HashMap<Character, HashMap<Character, Double>> emitP = new HashMap<>();

    static {
        // Allowed predecessor tags
        prevStatus.put('B', new char[]{'E', 'S'});
        prevStatus.put('M', new char[]{'M', 'B'});
        prevStatus.put('S', new char[]{'S', 'E'});
        prevStatus.put('E', new char[]{'B', 'M'});
        // Start log probabilities
        startP.put('B', -0.26268660809250016);
        startP.put('E', minDouble);
        startP.put('M', minDouble);
        startP.put('S', -1.4652633398537678);
        // Transition log probabilities
        transP.put('B', new HashMap<>());
        transP.put('E', new HashMap<>());
        transP.put('M', new HashMap<>());
        transP.put('S', new HashMap<>());
        transP.get('B').put('E', -0.51082562376599);
        transP.get('B').put('M', -0.916290731874155);
        transP.get('E').put('B', -0.5897149736854513);
        transP.get('E').put('S', -0.8085250474669937);
        transP.get('M').put('E', -0.33344856811948514);
        transP.get('M').put('M', -1.2603623820268226);
        transP.get('S').put('B', -0.7211965654669841);
        transP.get('S').put('S', -0.6658631448798212);
        // Emission log probabilities (one empty table per tag)
        emitP.put('B', new HashMap<>());
        emitP.put('E', new HashMap<>());
        emitP.put('M', new HashMap<>());
        emitP.put('S', new HashMap<>());
    }

    /**
     * Finds the most likely tag sequence for the observed characters.
     *
     * @param obs characters to tag
     * @return one tag per character of {@code obs}; the last tag is always
     *         {@code 'E'} or {@code 'S'}; empty when {@code obs} is empty
     */
    public ArrayList<Character> viterbi(String obs) {
        int len = obs.length();
        if (len == 0) {
            // Nothing to tag; without this guard charAt(0) below would throw.
            return new ArrayList<>();
        }
        ArrayList<HashMap<Character, Double>> V = new ArrayList<>();
        HashMap<Character, Double> first = new HashMap<>();
        V.add(first);
        HashMap<Character, ArrayList<Character>> path = new HashMap<>();
        for (char y : states) {
            first.put(y, startP.get(y) + emitP.get(y).getOrDefault(obs.charAt(0), minDouble));
            ArrayList<Character> start = new ArrayList<>();
            start.add(y);
            path.put(y, start);
        }
        for (int t = 1; t < len; t++) {
            HashMap<Character, Double> previous = V.get(t - 1);
            HashMap<Character, Double> current = new HashMap<>();
            V.add(current);
            HashMap<Character, ArrayList<Character>> newPath = new HashMap<>();
            for (char y : states) {
                double emP = emitP.get(y).getOrDefault(obs.charAt(t), minDouble);
                double maxProb = minDouble99;
                char state = 0;
                // Only the allowed predecessors of y are considered.
                for (char y0 : prevStatus.get(y)) {
                    double prob = previous.get(y0) + transP.get(y0).getOrDefault(y, minDouble) + emP;
                    if (prob > maxProb) {
                        maxProb = prob;
                        state = y0;
                    }
                }
                current.put(y, maxProb);
                // Extend the best predecessor's path with the current tag.
                ArrayList<Character> extended = new ArrayList<>(path.get(state));
                extended.add(y);
                newPath.put(y, extended);
            }
            path = newPath;
        }
        // A clause can only finish on E or S; S wins ties.
        double probE = V.get(len - 1).get('E');
        double probS = V.get(len - 1).get('S');
        char state = probE > probS ? 'E' : 'S';
        return path.get(state);
    }

    /**
     * Segments the clause using only the HMM (no dictionary).
     *
     * @param clause text to segment
     * @return the words in order; empty when {@code clause} is empty
     */
    public ArrayList<String> cut_without_dict(String clause) {
        ArrayList<Character> posList = viterbi(clause);
        int begin = 0;
        int nextI = 0;
        ArrayList<String> words = new ArrayList<>();
        int len = clause.length();
        for (int i = 0; i < len; i++) {
            char pos = posList.get(i);
            if (pos == 'B') {
                begin = i;
            } else if (pos == 'E') {
                words.add(clause.substring(begin, i + 1));
                nextI = i + 1;
            } else if (pos == 'S') {
                words.add(clause.substring(i, i + 1));
                nextI = i + 1;
            }
        }
        // Emit whatever follows the last completed word, if anything.
        if (nextI < len) {
            words.add(clause.substring(nextI));
        }
        return words;
    }
}

测试:

// Emission log probabilities for the demo, one row per character,
// columns in the order of the tags B, M, E, S.
char[] tags = {'B', 'M', 'E', 'S'};
char[] chars = {'南', '门', '大', '的'};
double[][] logProb = {
    {-5.5, -9.9, -9.9, -9.9}, // 南
    {-9.9, -9.9, -4.4, -9.9}, // 门
    {-9.9, -8.8, -3.3, -9.9}, // 大
    {-9.9, -9.9, -9.9, -1.1}, // 的
};
for (int c = 0; c < chars.length; c++) {
    for (int s = 0; s < tags.length; s++) {
        HmmTokenizer.emitP.get(tags[s]).put(chars[c], logProb[c][s]);
    }
}
HmmTokenizer tk = new HmmTokenizer();
String clause = "南大的南大门";
System.out.println(tk.viterbi(clause));          // [B, E, S, B, M, E]
System.out.println(tk.cut_without_dict(clause)); // [南大, 的, 南大门]

代码下载

https://download.youkuaiyun.com/download/Yellow_python/16498500

评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

小基基o_O

您的鼓励是我创作的巨大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值