0x00 Problem
Chinese word-frequency statistics. Corpus: 1998-01-2003版-带音.txt. Requirements: read the txt file, build a unigram (1-gram) and a bigram (2-gram) model, and output a word/frequency file and a word-pair/frequency file. Design a matching interface that can load the files quickly and look up single words and word pairs.
0x01 Solution
1. Java
ChineseAndEnglish.java
package 实验三;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ChineseAndEnglish {
    // True if the character falls in a CJK block (CJK punctuation and
    // full-width forms are deliberately treated as Chinese here).
    public static boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
    }

    // True if the string consists only of ASCII letters ("+" so the
    // empty string does not match).
    public static boolean isEnglish(String charaString) {
        return charaString.matches("[a-zA-Z]+");
    }

    // True if the string contains at least one Chinese character.
    public static boolean isChinese(String str) {
        String regEx = "[\\u4e00-\\u9fa5]+";
        Pattern p = Pattern.compile(regEx);
        Matcher m = p.matcher(str);
        return m.find();
    }
}
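A quick sanity check of the two helpers (a minimal usage sketch of my own, not part of the original project; assumes the class lives in the same 实验三 package):

public class Demo {
    public static void main(String[] args) {
        System.out.println(ChineseAndEnglish.isChinese('汉'));      // true: CJK ideograph
        System.out.println(ChineseAndEnglish.isEnglish("ns"));      // true: ASCII letters only
        System.out.println(ChineseAndEnglish.isChinese("上海/ns")); // true: contains Chinese
    }
}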
FileOperate.java
package 实验三;

import java.io.*;
import java.util.ArrayList;

public class FileOperate {
    // Read a UTF-8 text file line by line and hand each line to the callback.
    public static void readInput(LineSolution solution, String filename) {
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename), "UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                solution.solveLine(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Write the word/frequency list for the given n-gram scale.
    public static void writeWordOutput(int scale, ArrayList<String> list) {
        writeLines("D:\\eclipse-workspace\\nlp\\" + scale + "-gram分词统计.txt", list);
    }

    // Write the POS-tag/frequency list for the given n-gram scale.
    public static void writePropertyOutput(int scale, ArrayList<String> list) {
        writeLines("D:\\eclipse-workspace\\nlp\\" + scale + "-gram分词词性统计.txt", list);
    }

    private static void writeLines(String path, ArrayList<String> list) {
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), "UTF-8"))) {
            for (String s : list) {
                bw.write(s);
                bw.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Judge.java
package 实验三;

import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;

public class Judge {
    // Load an "n-gram分词统计.txt" file into a word -> frequency map, then look
    // the word up. Each statistics line has the form "word\ttimes: count".
    private static boolean findGram(String word, String filename, String message) {
        final Map<String, Integer> dictFreq = new HashMap<String, Integer>();
        FileOperate.readInput(new LineSolution() {
            @Override
            public void solveLine(String line) {
                String[] parts = line.split("\ttimes: ");
                if (parts.length == 2) {
                    try {
                        dictFreq.put(parts[0], Integer.parseInt(parts[1].trim()));
                    } catch (NumberFormatException ignored) {
                        // skip malformed lines
                    }
                }
            }
        }, filename);
        Integer freq = dictFreq.get(word);
        if (freq != null) {
            System.out.println(message + " occurrences: " + freq);
            return true;
        }
        return false;
    }

    public static boolean isOneGram(String word) {
        return findGram(word, "1-gram分词统计.txt", "The input is a single word!");
    }

    public static boolean isTwoGram(String word) {
        return findGram(word, "2-gram分词统计.txt", "The input is a two-word pair!");
    }

    public static void main(String[] args) {
        long begin0 = System.currentTimeMillis();
        System.out.println("Enter a word:");
        Scanner in = new Scanner(System.in);
        String word = in.nextLine();
        if (!isOneGram(word) && !isTwoGram(word)) {
            System.out.println("Not found! Please try again.");
        }
        long end0 = System.currentTimeMillis();
        System.out.println("Elapsed: " + (end0 - begin0) + " ms\n");
    }
}
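Each call to isOneGram/isTwoGram above re-reads the entire statistics file, so repeated queries are slow. To satisfy the "fast load and retrieval" requirement, here is a sketch of my own (the class name GramIndex is hypothetical, and it assumes Java 8+ so that LineSolution can be written as a lambda): load the file once into a HashMap, then answer each query in O(1).

package 实验三;

import java.util.HashMap;
import java.util.Map;

// Hypothetical helper: parse "word\ttimes: count" lines once, then query in O(1).
public class GramIndex {
    private final Map<String, Integer> freq = new HashMap<>();

    public GramIndex(String filename) {
        FileOperate.readInput(line -> {
            String[] parts = line.split("\ttimes: ");
            if (parts.length == 2) {
                try {
                    freq.put(parts[0], Integer.parseInt(parts[1].trim()));
                } catch (NumberFormatException ignored) { /* skip malformed lines */ }
            }
        }, filename);
    }

    public Integer lookup(String word) { // null if the word is absent
        return freq.get(word);
    }
}

With this, an interactive session would construct a GramIndex once per n-gram file and call lookup for every user query instead of re-scanning the file each time.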
TongJi.java
package 实验三;

import java.io.*;
import java.util.*;

public class TongJi {
    // Count unigrams and their POS-tag patterns from the tagged People's Daily corpus.
    public static void gram_1() {
        final HashMap<String, Integer> wordDictFrequency = new HashMap<>();
        final HashMap<String, Integer> propertyDictFrequency = new HashMap<>();
        FileOperate.readInput(new LineSolution() {
            @Override
            public void solveLine(String line) {
                // e.g. [上海/ns 国脉/nz 通讯/n 股份/n 有限公司/n]nt
                line = line.replace("[", "/");
                line = line.replace("]", "/");
                String[] lineSplit = line.split(" ");
                for (int i = 0; i < lineSplit.length; i++) {
                    String[] wordSplit = lineSplit[i].split("/");
                    String word = "";
                    String property = "";
                    // collect Chinese characters into the word and ASCII tags into the property string
                    for (String s : wordSplit) {
                        if (ChineseAndEnglish.isChinese(s)) {
                            word += s;
                        } else if (ChineseAndEnglish.isEnglish(s)) {
                            property += s + "->";
                        }
                    }
                    // word dictionary
                    if (wordDictFrequency.containsKey(word)) {
                        wordDictFrequency.put(word, wordDictFrequency.get(word) + 1);
                    } else {
                        wordDictFrequency.put(word, 1);
                    }
                    // POS-tag dictionary
                    if (propertyDictFrequency.containsKey(property)) {
                        propertyDictFrequency.put(property, propertyDictFrequency.get(property) + 1);
                    } else {
                        propertyDictFrequency.put(property, 1);
                    }
                }
            }
        }, "词性标注%40人民日报199801.txt");
        int sum = 0;
        int count = 0;
        ArrayList<String> list = new ArrayList<String>();
        ArrayList<String> list2 = new ArrayList<String>();
        for (Map.Entry<String, Integer> entry : wordDictFrequency.entrySet()) {
            if (entry.getKey() != null && !entry.getKey().equals("")) { // drop empty keys
                // the writer appends the newline itself, so none is added here
                list.add(entry.getKey() + "\ttimes: " + entry.getValue());
                sum += entry.getValue();
            }
        }
        for (Map.Entry<String, Integer> entry : propertyDictFrequency.entrySet()) {
            if (entry.getKey() != null && !entry.getKey().equals("")) {
                list2.add(entry.getKey() + "\ttimes: " + entry.getValue());
                count++;
            }
        }
        System.out.println("1-gram: number of distinct POS-tag patterns: " + count);
        System.out.println("1-gram: total word count: " + sum);
        FileOperate.writeWordOutput(1, list);
        FileOperate.writePropertyOutput(1, list2);
    }

    // Count bigrams (adjacent word pairs) and their POS-tag pairs.
    public static void gram_2() {
        final HashMap<String, Integer> wordDictFrequency = new HashMap<>();
        final HashMap<String, Integer> propertyDictFrequency = new HashMap<>();
        FileOperate.readInput(new LineSolution() {
            @Override
            public void solveLine(String line) {
                line = line.replace("[", "/");
                line = line.replace("]", "/");
                String[] lineSplit = line.split(" ");
                for (int i = 0; i < lineSplit.length - 1; i++) {
                    // join two adjacent tagged tokens, then separate words from tags
                    String pair = lineSplit[i] + "/" + lineSplit[i + 1];
                    String word = "";
                    String property = "";
                    for (String s : pair.split("/")) {
                        if (ChineseAndEnglish.isChinese(s)) {
                            word += s;
                        } else if (ChineseAndEnglish.isEnglish(s)) {
                            property += s + "->";
                        }
                    }
                    if (wordDictFrequency.containsKey(word)) {
                        wordDictFrequency.put(word, wordDictFrequency.get(word) + 1);
                    } else {
                        wordDictFrequency.put(word, 1);
                    }
                    // POS-tag dictionary
                    if (propertyDictFrequency.containsKey(property)) {
                        propertyDictFrequency.put(property, propertyDictFrequency.get(property) + 1);
                    } else {
                        propertyDictFrequency.put(property, 1);
                    }
                }
            }
        }, "词性标注%40人民日报199801.txt");
        int sum = 0;
        int count = 0;
        ArrayList<String> list = new ArrayList<String>();
        ArrayList<String> list2 = new ArrayList<String>();
        for (Map.Entry<String, Integer> entry : wordDictFrequency.entrySet()) {
            if (entry.getKey() != null && !entry.getKey().equals("")) {
                list.add(entry.getKey() + "\ttimes: " + entry.getValue());
                sum += entry.getValue();
            }
        }
        for (Map.Entry<String, Integer> entry : propertyDictFrequency.entrySet()) {
            if (entry.getKey() != null && !entry.getKey().equals("")) {
                list2.add(entry.getKey() + "\ttimes: " + entry.getValue());
                count++;
            }
        }
        System.out.println("2-gram: number of distinct POS-tag patterns: " + count);
        System.out.println("2-gram: total word-pair count: " + sum);
        FileOperate.writeWordOutput(2, list);
        FileOperate.writePropertyOutput(2, list2);
    }

    public static void main(String[] args) {
        long begin0 = System.currentTimeMillis();
        gram_1();
        gram_2();
        //state_p();
        //Sentence();
        System.out.println("Word statistics finished");
        long end0 = System.currentTimeMillis();
        System.out.println("Elapsed: " + (end0 - begin0) + " ms\n");
    }
}
// Callback interface for processing one line of the tagged corpus
interface LineSolution {
    void solveLine(String line);
}
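Since LineSolution has a single abstract method, on Java 8+ readInput can also take a lambda instead of an anonymous class (a small illustrative sketch of my own):

FileOperate.readInput(line -> System.out.println(line), "词性标注%40人民日报199801.txt");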
Full project download link:
2. Python
import re

def count_file():
    with open('199801.txt', 'r', encoding='utf8') as file:
        lines = file.read().split('\n')
    mod1 = {}
    mod2 = {}
    for line in lines:
        # strip the header field at the start of each line
        if line.startswith('199801'):
            line = line[23:]
        # keep only runs of Chinese characters and full-width digits
        words = re.findall('[\u4e00-\u9fa5\uff10-\uff19]+', line)
        # empty line
        if len(words) == 0:
            continue
        # unigram counts
        for word in words:
            if word in mod1:
                mod1[word] += 1
            else:
                mod1[word] = 1
        # add sentence-boundary markers for the bigram model
        words = ['<b>'] + words + ['<e>']
        # bigram counts
        for i in range(len(words) - 1):
            key = (words[i], words[i + 1])
            if key in mod2:
                mod2[key] += 1
            else:
                mod2[key] = 1
    mod1 = sorted(mod1.items(), key=lambda x: x[1], reverse=True)
    mod2 = sorted(mod2.items(), key=lambda x: x[1], reverse=True)
    # write out, most frequent first
    with open('mod1.txt', 'w', encoding='utf8') as file:
        for k, v in mod1:
            file.write(f'{k} {v}\n')
    with open('mod2.txt', 'w', encoding='utf8') as file:
        for k, v in mod2:
            file.write(f'{k} {v}\n')

def select(word, follow=None):
    if follow is None:
        with open('mod1.txt', 'r', encoding='utf8') as file:
            lines = file.read().split('\n')
        for line in lines:
            if not line:  # skip the trailing empty line
                continue
            k, v = line.split(' ')
            if k == word:
                return int(v)
    else:
        with open('mod2.txt', 'r', encoding='utf8') as file:
            lines = file.read().split('\n')
        for line in lines:
            if not line:
                continue
            # a mod2 line looks like "('企业', '改革') 123";
            # split(' ') yields ["('企业',", "'改革')", "123"]
            k1, k2, v = line.split(' ')
            if k1 + k2 == f"('{word}','{follow}')":
                return int(v)
    return False

if __name__ == '__main__':
    count_file()
    print(select('企业'))
    print(select('企业', '改革'))