基于规则的自动分词算法
原理
(1) 事先人工建立好分词词典和分词规则库。
(2) 原理为基于字符串匹配进行分词,这样就要求有足够大的词表为依据。
(3) 通过一定的算法来实现,如正向最大匹配法、逆向最大匹配法、双向匹配法等。
(4) 优缺点:当分词词典所收容的词较少时,显然覆盖度就有限,分词的正确率就低。
正向最大匹配法
算法描述
设MaxLen表示最大词长,D为分词词典
(1) 从待切分语料中按正向取长度为MaxLen的字串str,令Len=MaxLen;
(2) 把str与D中的词相匹配;
(3) 若匹配成功,则认为该字串为词,指向待切分语料的指针向前移Len个汉字(字节),返回到(1);
(4) 若不成功:如果Len>2,则将Len减2(此处长度以字节计,每个汉字占2字节;若以字符计则每次减1),从待切分语料中取长度为Len的字串str,返回到(2)。否则,得到一个单字词(2个字节),指向待切分语料的指针向前移1个汉字,返回(1)。
算法代码
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Forward maximum matching (MM) Chinese word segmenter.
 *
 * <p>At each position it tries the longest candidate (up to {@code totalMaxLen}
 * chars) against the dictionary, shrinking the window by one char on every
 * failed lookup; an unmatched single char is emitted as its own token.
 * Tokens in the result are separated by "/ ".
 *
 * @author quincy1994
 */
public class Nlp {

    private String m_sResult = "";  // accumulated segmentation result, tokens separated by "/ "
    private int m_nPosIndex;        // cursor into the text being segmented
    private int m_MaxLen;           // current candidate length; shrinks after failed lookups
    private int totalMaxLen;        // configured maximum word length (in chars)
    private Set<String> dictionary; // segmentation dictionary loaded from dict.txt

    /**
     * Creates a segmenter with the given maximum word length.
     *
     * @param maxLen maximum candidate word length (chars) tried at each position
     */
    public Nlp(int maxLen) {
        this.m_MaxLen = maxLen;
        this.m_nPosIndex = 0;
        this.totalMaxLen = maxLen;
        try {
            this.dictionary = this.loadFile();
        } catch (IOException ex) {
            Logger.getLogger(Nlp.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /** Creates a segmenter with the default maximum word length of 3. */
    public Nlp() {
        this(3); // delegate instead of duplicating the setup code
    }

    /**
     * Loads the dictionary from "dict.txt": one entry per line, the word being
     * the first comma-separated field.
     *
     * @return the set of dictionary words
     * @throws IOException if the file cannot be read
     */
    public Set<String> loadFile() throws FileNotFoundException, IOException {
        Set<String> dict = new HashSet<String>();
        String filename = "dict.txt";
        BufferedReader br = new BufferedReader(new FileReader(filename));
        try {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                String[] token = tmp.split(",");
                dict.add(token[0]);
            }
        } finally {
            br.close(); // the original version leaked the reader
        }
        return dict;
    }

    /**
     * Segments {@code source} by forward maximum matching.
     *
     * <p>Resets all internal state first, so the method can be called
     * repeatedly on the same instance (the original accumulated results
     * across calls).
     *
     * @param source text to segment
     * @return tokens joined with "/ " separators
     */
    public String MMSegment(String source) {
        m_sResult = "";
        m_nPosIndex = 0;
        m_MaxLen = totalMaxLen;
        MM(source, m_MaxLen, m_nPosIndex);
        return m_sResult;
    }

    /**
     * Returns the substring of length {@code len} starting at
     * {@code m_nPosIndex}, clamped to the end of the sentence.
     */
    public String getSubString(String source, int m_nPosIndex, int len) {
        // clamp the end index at the sentence boundary in one step
        int endIndex = Math.min(m_nPosIndex + len, source.length());
        return source.substring(m_nPosIndex, endIndex);
    }

    /**
     * Core matching loop; appends tokens to {@code m_sResult} and advances
     * {@code m_nPosIndex} until the whole text is consumed.
     *
     * <p>Implemented iteratively: the original recursed once per token and
     * could overflow the stack on long input. The public signature is kept.
     */
    public void MM(String source, int len, int frompos) {
        while (m_nPosIndex < source.length()) {
            String sub = getSubString(source, m_nPosIndex, m_MaxLen);
            if (dictionary.contains(sub)) {
                // matched: advance by the *actual* matched length — the
                // candidate may have been clamped near the sentence end
                m_sResult += sub + "/ ";
                m_nPosIndex += sub.length();
                m_MaxLen = totalMaxLen;
            } else if (m_MaxLen > 1) {
                m_MaxLen = m_MaxLen - 1; // shrink the window and retry
            } else {
                // no dictionary word starts here: emit the single char
                m_sResult += sub + "/ ";
                m_nPosIndex += 1;
                m_MaxLen = totalMaxLen;
            }
        }
    }

    /**
     * Demo entry point; requires "dict.txt" in the working directory.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) {
        Nlp nlp = new Nlp();
        String source = "今天天气不错!";
        String result = nlp.MMSegment(source);
        System.out.println(result);
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
逆向最大匹配法
算法描述
与正向最大匹配法原理一样,只是匹配的开始为句尾
代码实现
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Reverse maximum matching (RMM) Chinese word segmenter.
 *
 * <p>Same idea as forward maximum matching, but the window slides from the
 * end of the sentence toward the beginning. Tokens are collected
 * right-to-left as "/word" and reversed into reading order on output.
 *
 * @author quincy1994
 */
public class RMM {

    private String m_sResult = "";  // tokens accumulated right-to-left, each prefixed with "/"
    private int m_nPosIndex;        // cursor: exclusive end of the current window
    private int m_MaxLen;           // current candidate length; shrinks after failed lookups
    private int totalMaxlen;        // configured maximum word length (in chars)
    private Set<String> dictionary; // segmentation dictionary loaded from dict.txt

    /**
     * Creates a segmenter with the given maximum word length.
     *
     * @param maxLen maximum candidate word length (chars) tried at each position
     */
    public RMM(int maxLen) {
        this.m_MaxLen = maxLen;
        this.totalMaxlen = maxLen;
        try {
            this.dictionary = loadFile();
        } catch (IOException ex) {
            Logger.getLogger(RMM.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /** Creates a segmenter with the default maximum word length of 3. */
    public RMM() {
        this(3); // delegate instead of duplicating the setup code
    }

    /**
     * Loads the dictionary from "dict.txt": one entry per line, the word being
     * the first comma-separated field.
     *
     * @return the set of dictionary words
     * @throws IOException if the file cannot be read
     */
    public Set<String> loadFile() throws IOException {
        Set<String> dict = new HashSet<String>();
        String filename = "dict.txt";
        BufferedReader br = new BufferedReader(new FileReader(filename));
        try {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                dict.add(tmp.split(",")[0]);
            }
        } finally {
            br.close(); // the original version leaked the reader
        }
        return dict;
    }

    /**
     * Segments {@code source} by reverse maximum matching.
     *
     * <p>Resets all internal state first, so the method can be called
     * repeatedly on the same instance (the original accumulated results
     * across calls).
     *
     * @param source text to segment
     * @return tokens in reading order, joined with "/ " separators
     */
    public String RMMSegment(String source) {
        m_sResult = "";
        m_MaxLen = totalMaxlen;
        this.m_nPosIndex = source.length();
        rmm(source, m_MaxLen, m_nPosIndex);
        // tokens were appended right-to-left; reverse them for output
        String[] token = m_sResult.split("/");
        StringBuilder result = new StringBuilder();
        for (int i = token.length - 1; i > 0; i--) {
            result.append(token[i]).append("/ ");
        }
        return result.toString();
    }

    /**
     * Returns the substring of length {@code len} ending at
     * {@code m_nPosIndex}, clamped to the start of the sentence.
     */
    public String getSubString(String source, int m_nPosIndex, int len) {
        // clamp the start index at the sentence boundary in one step
        int startIndex = Math.max(m_nPosIndex - len, 0);
        return source.substring(startIndex, m_nPosIndex);
    }

    /**
     * Core matching loop; appends tokens to {@code m_sResult} and retreats
     * {@code m_nPosIndex} until the whole text is consumed.
     *
     * <p>Implemented iteratively and terminating at position 0: the original
     * recursed once per token (stack-overflow risk on long input) and only
     * stopped at a negative cursor, appending a spurious empty token when the
     * cursor landed exactly on 0. The public signature is kept.
     */
    public void rmm(String source, int len, int frompos) {
        while (m_nPosIndex > 0) {
            String sub = getSubString(source, m_nPosIndex, m_MaxLen);
            if (dictionary.contains(sub)) {
                // matched: retreat by the *actual* matched length — the
                // candidate may have been clamped near the sentence start
                m_sResult += "/" + sub;
                m_nPosIndex -= sub.length();
                m_MaxLen = totalMaxlen;
            } else if (m_MaxLen > 1) {
                m_MaxLen = m_MaxLen - 1; // shrink the window and retry
            } else {
                // no dictionary word ends here: emit the single char
                m_sResult += "/" + sub;
                m_nPosIndex -= 1;
                m_MaxLen = totalMaxlen;
            }
        }
    }

    /**
     * Demo entry point; requires "dict.txt" in the working directory.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) {
        RMM myRMM = new RMM();
        String source = "记录最佳前候选词列表";
        String result = myRMM.RMMSegment(source);
        System.out.println(result);
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
基于统计的中文分词算法
基本思想
选择概率最大的分词路径作为最优结果
利用动态规划算法来实现,即最优路径中的第i个词w i 的累计概率等于它的左相邻词w i-1 的累积概率乘以w i 自身的概率
具体算法
(1)对一个待分词的字串S,按照从左到右的顺序取出全部候选词w 1 ,w 2 ,…,w i ,w n ;
(2)计算每个候选词的概率值P(w i ),记录每个候选词的全部左邻词;
(3)计算每个候选词的累计概率,累计概率最大的候选词为最佳左邻词;
如果当前词w n 是字串的尾词,且累计概率P’(w n )最大,则w n 是S的终点词;
(4)从w n 开始,按照从右到左顺序,依次将每个词的最佳左邻词输出,即S的分词结果.
字典树
又称单词查找树,Trie树,是一种树形结构,是一种哈希树的变种。典型应用是用于统计,排序和保存大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,查询效率比哈希树高。
字典树的代码实现
主要参考:http://blog.youkuaiyun.com/sadfishsc/article/details/9152647
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.util.HashMap;
import java.util.Map;
/**
 * A node of the dictionary trie (prefix tree) used for word-frequency lookup.
 *
 * <p>Each node holds a single character; a root-to-node path spells a word
 * prefix. {@code frequency} and {@code antilog} stay at the sentinel value -1
 * unless the path ending at this node is a complete dictionary word. The
 * child map is created lazily on the first {@link #addChild}.
 *
 * @author quincy1994
 */
public class TireNode {

    private String character;               // the single character held by this node
    private int frequency = -1;             // word frequency; -1 marks "prefix only, not a word"
    private double antilog = -1;            // log-scaled frequency; -1 until the word is registered
    private Map<String, TireNode> children; // lazily created child map, keyed by character

    public String getCharacter() {
        return character;
    }

    public void setCharacter(String character) {
        this.character = character;
    }

    public int getFrequency() {
        return frequency;
    }

    public void setFrequency(int frequency) {
        this.frequency = frequency;
    }

    public double getAntilog() {
        return antilog;
    }

    public void setAntilog(double antilog) {
        this.antilog = antilog;
    }

    /**
     * Registers {@code node} as a child keyed by its character.
     * An already-present child with the same character is kept unchanged.
     */
    public void addChild(TireNode node) {
        if (children == null) {
            children = new HashMap<String, TireNode>();
        }
        String key = node.getCharacter();
        if (children.get(key) == null) {
            children.put(key, node);
        }
    }

    /** Returns the child holding {@code ch}, or {@code null} if there is none. */
    public TireNode getChild(String ch) {
        return children == null ? null : children.get(ch);
    }

    /** Removes the child holding {@code ch}; a no-op when it is absent. */
    public void removeChildren(String ch) {
        if (children != null) {
            children.remove(ch);
        }
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
算法实现
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Statistical Chinese word segmentation.
 *
 * <p>Builds a {@link TireNode} trie from the frequency dictionary
 * "wordFre.txt", enumerates every dictionary word occurring in the input
 * (candidate segments), and selects the segmentation whose total cost is
 * minimal via a dynamic-programming pass over the candidates. The per-word
 * cost {@code Math.log(1 + 0.01/percentage)} decreases as the word's
 * frequency grows, so minimizing the summed cost favors frequent words.
 *
 * @author quincy1994
 */
public class ChnSeq {

    private TireNode tire = null; // root of the word-frequency trie; populated by init()

    /**
     * Reads the raw frequency dictionary.
     * Each line of "wordFre.txt" is "word,frequency,percentage%" (comma-separated).
     *
     * @return the file's lines, unparsed
     * @throws IOException if the file cannot be read
     */
    public List<String> loadFile() throws FileNotFoundException, IOException {
        // load the dictionary file line by line
        List<String> lines = new ArrayList<String>();
        String filename = "wordFre.txt";
        BufferedReader br = new BufferedReader(new FileReader(filename));
        String tmp;
        while ((tmp = br.readLine()) != null) {
            lines.add(tmp);
        }
        br.close();
        return lines;
    }

    /**
     * Parses the dictionary and builds the trie.
     * Must be called before {@link #segment} / {@link #preSegment}.
     *
     * @throws IOException if the dictionary file cannot be read
     */
    public void init() throws IOException {
        List<String> lines = loadFile();
        tire = new TireNode();
        for (String line : lines) {
            String[] tokens = line.split(",");
            String word = tokens[0];
            int freq = Integer.parseInt(tokens[1]);
            // cost = log(1 + 0.01/percentage): strictly decreasing in frequency
            double antilog = Math.log(1+0.01/Double.parseDouble(tokens[2].replace("%", ""))) ;
            // build the trie path for this word, creating nodes as needed
            TireNode root = tire;
            for (int i = 0; i < word.length(); i++) {
                String c = "" + word.charAt(i);
                TireNode node = root.getChild(c);
                if (node == null) {
                    node = new TireNode();
                    node.setCharacter(c);
                    root.addChild(node);
                }
                root = node;
            }
            root.setFrequency(freq);   // mark the terminal node as a complete word
            root.setAntilog(antilog);  // store the word's cost at the terminal node
        }
    }

    /** Returns the trie root (null until {@link #init} has run). */
    public TireNode getTire() {
        return tire;
    }

    /**
     * Walks the trie along {@code word}.
     *
     * @return the node reached by the last character, or {@code null} if the
     *         path does not exist; a non-null node with frequency <= 0 means
     *         {@code word} is only a prefix, not a dictionary word
     */
    public TireNode getNodeByWord(String word) {
        TireNode node = tire;
        for (int i = 0; i < word.length(); i++) {
            String ch = word.charAt(i) + "";
            if (node == null) {
                break;
            } else {
                node = node.getChild(ch);
            }
        }
        return node;
    }

    /**
     * One candidate segment of the sentence, plus the artificial START/END
     * terminals bracketing the candidate list.
     *
     * <p>NOTE(review): this class is private yet appears in the signatures of
     * the public methods preSegment/dynamicSegment, so outside callers cannot
     * name it — consider making it a public static nested class.
     */
    private class Segment {
        public String word;     // the candidate word itself
        public String endChar;  // last character of the word
        public String lastChar; // character immediately before the word in the sentence (null for START)
        public double cost;     // word cost from the trie (see init)
        public final static String START_SIGN = "<< STARTING >>";
        public final static String END_SIGN = "<< ENDING >>";
    }

    /**
     * Enumerates every dictionary word occurring in {@code sentence}, in
     * left-to-right order of start position, bracketed by START/END terminals.
     *
     * @param sentence text to segment
     * @return candidate list; index 0 is the START terminal, the last index is END
     */
    public List<Segment> preSegment(String sentence) {
        List<Segment> segs = new ArrayList<Segment>();
        // sentence-start terminal
        Segment terminal = new Segment();
        terminal.word = Segment.START_SIGN;
        terminal.endChar = Segment.START_SIGN;
        terminal.lastChar = null;
        segs.add(terminal);
        for (int i = 0; i < sentence.length(); i++) {
            for (int j = i + 1; j <= sentence.length(); j++) {
                String word = sentence.substring(i, j);
                TireNode tnode = this.getNodeByWord(word);
                if (tnode == null) {
                    // no trie path: no longer word can start at i either
                    break;
                }
                if (tnode.getFrequency() <= 0) {
                    // prefix of a longer word but not a word itself; keep extending
                    continue;
                }
                Segment seg = new Segment();
                seg.word = word;
                seg.endChar = word.substring(word.length() - 1, word.length());
                if (i == 0) {
                    seg.lastChar = Segment.START_SIGN;
                } else {
                    seg.lastChar = sentence.substring(i - 1, i);
                }
                seg.cost = tnode.getAntilog();
                // NOTE(review): debug output left in; remove for production use
                System.out.println(word + " " + seg.cost +" " + tnode.getFrequency());
                segs.add(seg);
            }
        }
        // sentence-end terminal
        terminal = new Segment();
        terminal.word = Segment.END_SIGN;
        terminal.endChar = Segment.END_SIGN;
        terminal.lastChar = sentence.substring(sentence.length() - 1, sentence.length());
        segs.add(terminal);
        return segs;
    }

    /**
     * Picks the minimum-cost segmentation path by dynamic programming over the
     * candidate list produced by {@link #preSegment}.
     *
     * @param segs candidate segments including the START/END terminals
     * @return chosen words joined with "/ ", or {@code null} when there are no candidates
     */
    public String dynamicSegment(List<Segment> segs) {
        // probability-based segmentation via dynamic programming
        final double INFINITE = 9999999; // sentinel: "no transition between these candidates"
        if (segs == null || segs.size() == 0) {
            System.out.println("找不到候选词");
            return null;
        }
        int n = segs.size(); // number of candidates (terminals included)
        // costs[i][j]: cost of choosing candidate j directly after candidate i
        double[][] costs = new double[n][n];
        for (int i = 0; i < n - 1; i++) {
            for (int j = 0; j < n; j++) {
                String endChar = segs.get(i).endChar;
                // diagonal: a candidate whose word equals its own endChar
                // (single-character word) pays its own cost; everything else
                // starts unreachable
                if (j == i && endChar.equals(segs.get(j).word)) {
                    costs[i][j] = segs.get(j).cost; // cost of candidate j
                    continue;
                }
                costs[i][j] = INFINITE;
            }
        }
        // link each candidate to its possible predecessors: j can follow i when
        // j's preceding character equals i's last character, and j is at most 3
        // positions after i in the (start-ordered) candidate list
        for (int i = 0; i < n - 1; i++) {
            String endChar = segs.get(i).endChar;
            for (int j = i + 1; j < n; j++) {
                String lastChar = segs.get(j).lastChar;
                if (lastChar != null && lastChar.equals(endChar) &&( j- i < 4)) { // j's prefix char matches i's end char, within a window of 4 candidates
                    costs[i][j] = segs.get(j).cost; // cost of candidate j
                }
            }
        }
        int sp = 0;     // source: the START terminal
        int fp = n - 1; // sink: the END terminal
        double[] dist = new double[n];                // cumulative cost to reach each candidate
        List<List<Integer>> sPaths = new ArrayList<List<Integer>>(); // best predecessor chain per candidate
        List<Integer> list = new ArrayList<Integer>(); // work list of candidate indices
        for (int i = 0; i < n; i++) {
            dist[i] = costs[sp][i]; // initial cost: direct transition from START to i
            if (sp != i) {
                list.add(i); // queue every candidate except the source
            }
            if (dist[i] < INFINITE) {
                // i is directly reachable from START: open a path for it
                List<Integer> spa = new ArrayList<Integer>();
                sPaths.add(spa);
            } else {
                sPaths.add(null);
            }
        }
        while (!list.isEmpty()) {
            // take the next candidate in list order
            Integer minIdx = list.get(0);
            // NOTE: minIdx is an Integer, so this removes the *element*, not
            // the position at that index
            list.remove(minIdx);
            // skip candidates that are still unreachable
            if(dist[minIdx] == INFINITE){
                continue;
            }
            // relax all transitions out of minIdx
            for (int i = minIdx+1; i < n; i++) {
                if (dist[i] > dist[minIdx] + costs[minIdx][i]) {
                    dist[i] = dist[minIdx] + costs[minIdx][i];
                    List<Integer> tmp = new ArrayList<Integer>(sPaths.get(minIdx));
                    tmp.add(minIdx);
                    sPaths.set(i, tmp); // record the best predecessor chain for i
                }
            }
        }
        // the best chain into the END terminal lists the chosen word indices
        // in reading order (START and END themselves are excluded)
        String result = "";
        for (int i = 0; i < sPaths.get(fp).size(); i++) {
            result += segs.get(sPaths.get(fp).get(i)).word + "/ ";
        }
        return result;
    }

    /**
     * Convenience wrapper: candidate generation followed by the DP selection.
     *
     * @param sentences text to segment
     * @return chosen words joined with "/ "
     */
    public String segment(String sentences) {
        return dynamicSegment(preSegment(sentences));
    }

    /**
     * Demo entry point; requires "wordFre.txt" in the working directory.
     */
    public static void main(String[] args) throws ClassNotFoundException, IOException {
        ChnSeq cs = new ChnSeq();
        cs.init();
        String sentence = "在这一年中,改革开放和现代化建设继续向前迈进。经济保持了“高增长、低通胀”的良好发展态势。农业生产再次获得好的收成,企业改革继续深化,人民生活进一步改善。对外经济技术合作与交流不断扩大。";
        String segs = cs.segment(sentence);
        System.out.println(segs);
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
- 148
- 149
- 150
- 151
- 152
- 153
- 154
- 155
- 156
- 157
- 158
- 159
- 160
- 161
- 162
- 163
- 164
- 165
- 166
- 167
- 168
- 169
- 170
- 171
- 172
- 173
- 174
- 175
- 176
- 177
- 178
- 179
- 180
- 181
- 182
- 183
- 184
- 185
- 186
- 187
- 188
- 189
- 190
- 191
- 192
- 193
- 194
- 195
- 196
- 197
- 198
- 199
- 200
- 201
- 202
- 203
- 204
- 205
- 206
- 207
- 208
- 209
- 210
- 211
- 212
- 213
- 214
- 215
- 216
- 217
- 218
- 219
- 220
- 221
- 222
- 223
- 224
- 225
- 226
- 227
- 228
- 229
- 230
- 231
具体的代码和字典,可以访问:
https://github.com/Quincy1994/Segment