原创中文分词代码分享（1.2）——词典接口

双Hash结构词典实现

最新推荐文章于 2022-07-16 22:09:40 发布

最新推荐文章于 2022-07-16 22:09:40 发布 · 174 阅读

文章标签：

#Java

分词与索引专栏收录该内容

8 篇文章

订阅专栏

最后说一下双Hash结构的实现类DoubleHashDictionary：

java 代码

/* * @作者:Hades , 创建日期:2006-11-17 * * 汕头大学03计算机本科 * */ package edu.stu.cn.segment.matching.dictionary; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.io.PrintStream; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.Hashtable; import java.util.LinkedList; /** * @author Hades Guan 基于词典分词方法中使用的词典实例 */ public class DoubleHashDictionary implements Serializable, DictionaryImpl { /** * serialVersionUID 的注释 */ private static final long serialVersionUID = -6085097706592874294L; /** * 词典索引表 */ private Hashtablenull; /** * 最大词长 */ private int maxWordLen = 0; /** * 词典长度 */ private int wordCount = 0; /** * 删除词典中的词word * * @param word * 待删除的词汇 */ public void deleteWord(String word) { if (word == null) return; // 过滤多于空格 word = word.trim(); // 获取词长 int len = word.length(); // 判断词长为len的二级索引表（首字hash表）是否为空 if (this.indexTable[len - 1] == null) return; // 获取词的首字 String fch = word.substring(0, 1); // 首字对应的词汇列表 ArrayList<string> wal = null; </string> if (this.indexTable[len - 1].containsKey(fch)) wal = this.indexTable[len - 1].get(fch); else return; // 判断是否包含该词汇 String str = word.substring(1, len); if (Collections.binarySearch(wal, str) >= 0) { wal.remove(str); this.indexTable[len - 1].put(fch, wal); } else return; } /** * @return 返回 maxWordLen。 */ public int getMaxWordLen() { return maxWordLen; } /** * @return 返回 wordCount。 */ public int getWordCount() { return wordCount; } /** * 将词汇word插入到词典文件中 * * @param word * 待插入的词汇 */ public void insertWord(String word) { if (word == null) return; // 过滤多于空格 word = word.trim(); // 获取词长 int len = word.length(); // 初始化二级索引表（首字hash表） if (this.indexTable[len - 1] == null) this.indexTable[len - 1] = new Hashtable // 获取词的首字 String fch = word.substring(0, 1); // 首字对应的词汇列表 ArrayList<string> wal = null; </string> if (this.indexTable[len - 1].containsKey(fch)) wal = this.indexTable[len - 1].get(fch); else wal = new ArrayList<string>(); </string> // 截取剩余部分 
String str = word.substring(1, len); // 当词汇表中不存在当前词汇时插入新词汇 if (Collections.binarySearch(wal, str) < 0) wal.add(str); Collections.sort(wal); this.indexTable[len - 1].put(fch, wal); } /** * 载入以文本格式存储的词典 * * @param fileName * 词典的文件名 */ @SuppressWarnings("unchecked") public void loadDictionary(String fileName) { try { // 初始化输入流 BufferedReader in = new BufferedReader(new FileReader(fileName)); String word = null; // 初始化记录链表 LinkedList<string> wordLink = new LinkedList<string>(); </string></string> // 最大词长 this.maxWordLen = 0; // 读取词典 while ((word = in.readLine()) != null) { if (word.length() > this.maxWordLen) this.maxWordLen = word.length(); wordLink.add(word); this.wordCount++; } // 初始化一级索引表（词长索引表） this.indexTable = new Hashtable[this.maxWordLen]; // 重新遍历词典链表 for (String w : wordLink) { // 插入词汇 this.insertWord(w); } // 回收资源 wordLink.clear(); } catch (IOException e) { // TODO 自动生成 catch 块 e.printStackTrace(); } } /** * 判断输入的字符串是否在词典中 * * @param word * 待判断字符串 * @return 判断结果 */ public boolean match(String word) { if (word == null) return false; // 获取词长 int len = word.length(); // 当词长大于当前词库中最大词长则返回false if (len > this.maxWordLen) return false; // 当词长为len的hash索引表未被初始化时返回false if (this.indexTable[len - 1] == null) return false; // 获取首字 String fch = word.substring(0, 1); if (this.indexTable[len - 1].containsKey(fch)) { if (len == 1) return true; else { // 获取以fch开头的词汇表 ArrayList<string> wal = this.indexTable[len - 1].get(fch); </string> // 折半查找 if (Collections.binarySearch(wal, word.substring(1, len)) < 0) return false; else return true; } } else return false; } /** * 输出已载入内存中所有词汇 * * @param out * 输出流 */ public void print(PrintStream out) { for (int i = 0; i < this.indexTable.length; i++) { out.println("词长：" + (i + 1)); // 判断词典是否已初始化 if (this.indexTable[i] != null) { for (String fch : this.indexTable[i].keySet()) { out.println("首字：" + fch); for (String w : this.indexTable[i].get(fch)) out.println("\t" + w); } } } out.flush(); } }

为什么说是双Hash结构呢？因为在查询词汇时，先使用词汇的长度length作为第一次Hash的key取出Hashtable结构的value，接下来就跟首字Hash查询的操作一样了：取首字作为key取出一维线性表的value后采用折半查找。当词典中词汇数目很大时，一维线性表过长，进行折半查找无疑会增加比较的次数，从而降低效率。而使用双Hash正是希望通过多增加一次Hash求值，将长的词汇表切分成多段较短的一维线性表，从而减少折半查找时的比较次数。

既然说到了序列化，当然少不了序列化操作类DictionaryUtil：

java 代码

/* * @作者:Hades , 创建日期:2006-11-18 * * 汕头大学03计算机本科 * */ package edu.stu.cn.segment.matching.util; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import edu.stu.cn.segment.matching.dictionary.DictionaryImpl; /** * @author Hades Guan 词典工具类 * */ public class DictionaryUtilextends DictionaryImpl> { /** * 从fileName文件中读入词典实例 * * @param fileName * 存储文件 * @return 词典实例 */ @SuppressWarnings("unchecked") public T readDictionary(String fileName) { try { ObjectInputStream in = new ObjectInputStream( new BufferedInputStream(new FileInputStream(fileName))); T dic = (T) in.readObject(); in.close(); return dic; } catch (Exception e) { System.err.println(e.getMessage()); return null; } } /** * 将词典实例dic写入到fileName文件中 * * @param dic * 词典实例 * @param fileName * 存储文件 * @return 操作成功与否 */ public boolean writeDictionary(T dic, String fileName) { try { ObjectOutputStream out = new ObjectOutputStream( new BufferedOutputStream(new FileOutputStream(fileName))); out.writeObject(dic); out.flush(); out.close(); return true; } catch (IOException e) { System.err.println(e.getMessage()); return false; } } }

通过这个操作类，可以把实现了DictionaryImpl接口的词典实例写入文件，或者从文件中读出。