记得有一段时间因为想要考研,需要记单词。我当然想记一些常用的单词,但是哪些单词才是常用的呢?现在外面有很多分频词汇的册子,我也买过,但总是不放心,于是决定自己写一个程序来统计单词的出现频率。这个程序就是那天晚上写的,还比较管用:只要把要分析的英文文章放到一个指定的目录下面,它就可以自动地统计这个目录下所有英文资料中各个单词出现的频率,并排序后输出。你不妨试试看哦,哈哈。编程是一种乐趣。
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Enumeration;
import java.util.Hashtable;
/**
 * Holds words together with their occurrence counts and offers sorting,
 * printing and indexed access.
 * <p>
 * Words and counts live in two parallel arrays: entry {@code i} of one array
 * belongs to entry {@code i} of the other. The constructor argument is only an
 * initial capacity; the arrays grow when more elements are added.
 * @author ZhuTao HUST 2006.6.2-2006.6.3
 * Email:greenkugua@sina.com.cn.
 * QQ:307356132
 * @version 1.0
 */
class Set{
    private int num;// Number of words stored so far.
    private int[] times;// times[i] is the occurrence count of word[i].
    private String[] word;

    /**
     * Creates an empty set.
     * @param size the initial capacity of the internal arrays.
     */
    public Set(int size){
        this.num = 0;
        this.times = new int[size];
        this.word = new String[size];
    }

    /**
     * Appends a word and its occurrence count to the set.
     * No check for duplicates is made.
     * @param word is the word you want to add.
     * @param times is the number of times it appears.
     */
    public void addElement(String word,int times)
    {
        if(this.num == this.word.length)
        {
            // Full: double the capacity (at least 1) instead of failing
            // with an ArrayIndexOutOfBoundsException.
            int capacity = Math.max(1,this.num*2);
            String[] grownWords = new String[capacity];
            int[] grownTimes = new int[capacity];
            System.arraycopy(this.word,0,grownWords,0,this.num);
            System.arraycopy(this.times,0,grownTimes,0,this.num);
            this.word = grownWords;
            this.times = grownTimes;
        }
        this.times[this.num] = times;
        this.word[this.num] = word;
        this.num++;
    }

    /**
     * Sorts the stored words by occurrence count, highest count first.
     * This is a simple exchange sort, so it needs a quadratic number of
     * comparisons; words with equal counts end up in no particular order.
     */
    public void sort()
    {
        for(int i = 0;i<this.num;i++)
        {
            for(int j = i+1;j<this.num;j++)
            {
                // Whenever a later entry has a larger count, swap it into
                // slot i; after the inner loop slot i holds the largest
                // count among the entries from i onwards.
                if(this.times[j]>this.times[i])
                {
                    String swappedWord = this.word[i];
                    int swappedTimes = this.times[i];
                    this.times[i] = this.times[j];
                    this.word[i] = this.word[j];
                    this.times[j] = swappedTimes;
                    this.word[j] = swappedWord;
                }
            }
        }
    }

    /**
     * Prints the number of words followed by one "word : count" line per
     * stored word, in the current storage order, to standard output.
     */
    public void showResult()
    {
        System.out.println("总共有" +this.num+"个单词,它们的出现频率降序排列如下:");
        for(int i = 0;i<this.num;i++)
        {
            System.out.println(this.word[i]+" : "+this.times[i]);
        }
    }

    /**
     * Returns the number of words stored.
     * @return The number of words.
     */
    public int getCount()
    {
        return this.num;
    }

    /**
     * Returns the word stored at position i.
     * The index is not checked against {@link #getCount()}.
     * @param i is the position of the word you want to get.
     * @return the word at position i.
     */
    public String getWordAt(int i)
    {
        return this.word[i];
    }

    /**
     * Returns the occurrence count stored at position i.
     * The index is not checked against {@link #getCount()}.
     * @param i is the position of the word.
     * @return the occurrence count of the word at position i.
     */
    public int getFrequency(int i)
    {
        return this.times[i];
    }
}
/**
* This class is designed to analyse
* the word frequency of English articles.
* @author ZhuTao HUST.
* 2006.6.2-2006.6.3
* Email:greenkugua@sina.com.cn.
* QQ:307356132
* @version:1.0
*/
public class EnglishWord {
private Set resultSet;
/**
 * Program entry point: analyses the input and saves the sorted word list.
 * @param args optional paths: {@code args[0]} is the file or directory to
 * analyse and {@code args[1]} is the file the result is saved to. Any path
 * that is not given falls back to the built-in default location.
 */
public static void main(String[] args) {
    // Defaults are the locations the program originally had hard-coded.
    String inputPath = "G://test";
    String outputPath = "G://tao.txt";
    if(args != null&&args.length>=1)
    {
        inputPath = args[0];
    }
    if(args != null&&args.length>=2)
    {
        outputPath = args[1];
    }
    // All of the work happens in the constructor.
    new EnglishWord(inputPath,outputPath);
}
/**
 * Runs the whole analysis: reads the input, counts the words, sorts them by
 * frequency, prints the result and saves it.
 * If the input cannot be read, the result set is left empty and nothing is
 * printed or saved.
 * @param filepath the file or directory to read (passed to readFile).
 * @param savepath the file the result is written to (passed to saveResult).
 */
public EnglishWord(String filepath,String savepath)
{
    String content = this.readFile(filepath);
    if(content == null)
    {
        // readFile returns null after reporting an I/O error, so there is
        // nothing to analyse; keep resultSet usable and stop here.
        this.resultSet = new Set(0);
        return;
    }
    Hashtable table = this.getWordList(content);
    this.initSet(table);
    this.resultSet.sort();
    this.resultSet.showResult();
    this.saveResult(savepath);
}
/**
* This method is used to analyse the times of each word appears in
* the file.
* @param content is the content of file you.
* @return the word map.Which contains words and times it appeared.
*/
public Hashtable getWordList(String content)
{
Hashtable wordList = new Hashtable();
int i = 0;
for(;i<content.length();i++)
{
char ch = content.charAt(i);
if((ch>'Z'&&ch<'a')||ch<'A'||ch>'z');
else break;
}
boolean flag = true;
String word = new String();
char ch ;
for(;i<content.length();i++)
{
ch = content.charAt(i);
if((ch>='A'&&ch<='Z')||(ch>='a'&&ch<='z')||ch == '/''){
word+=ch;
flag = true;
}
else{
//如果已经包含该单词,就计数加一;
if(ch == '-'){i+=2;continue;}
if(flag)
{
if(wordList.containsKey(word))
{
Integer num = (Integer)wordList.get(word);
int t = num.intValue()+1;
wordList.put(word,new Integer(t));
}
else
{
wordList.put(word,new Integer(1));
}
}
flag = false;
word = "";
}
}
if(!word.equals(""))
{
if(wordList.containsKey(word))
{
Integer num = (Integer)wordList.get(word);
int t = num.intValue()+1;
wordList.put(word,new Integer(t));
}
else
{
wordList.put(word,new Integer(1));
}
}
return wordList;
}
/**
* @see This method is used to read content from file.
* @param String filepath is the path and name of the file
* which you want to analyse.
* @return return the content of file in String form.
*/
public String readFile(String filepath)
{
try{
File file= new File(filepath);
if(file.isDirectory())
{
String[] list = file.list();
String str = new String();
System.out.println("文件的个数:"+list.length+" 文件列表如下:");
for(int i =0;i<list.length;i++)
{
System.out.println(list[i]);
FileInputStream read = new FileInputStream(filepath+'//'+list[i]);
byte[]data = new byte[read.available()];
read.read(data);
read.close();
String content = new String(data);
str +=content;
}
return str;
}
else{
FileInputStream read = new FileInputStream(filepath);
byte[]data = new byte[read.available()];
read.read(data);
read.close();
String content = new String(data);
return content;
}
}catch(IOException e){
System.out.println(e);
return null;
}
}
/**
 * Builds the result set from a word map: every key becomes a word and its
 * Integer value becomes that word's occurrence count.
 * @param wordList the map of words to the number of times they appeared.
 */
public void initSet(Hashtable wordList)
{
    Enumeration remaining = wordList.keys();
    Set target = new Set(wordList.size());
    this.resultSet = target;
    for(;remaining.hasMoreElements();)
    {
        Object entryKey = remaining.nextElement();
        target.addElement(entryKey.toString(),((Integer)wordList.get(entryKey)).intValue());
    }
}
/**
 * Prints every entry of the word map to standard output as "word : count",
 * one entry per line, in the map's own enumeration order.
 * @param wordList the map of words to the number of times they appeared.
 */
public void showTable(Hashtable wordList)
{
    for(Enumeration cursor = wordList.keys();cursor.hasMoreElements();)
    {
        Object entryKey = cursor.nextElement();
        String label = entryKey.toString();
        int occurrences = ((Integer)wordList.get(entryKey)).intValue();
        System.out.println(label+" : "+occurrences);
    }
}
/**
 * Writes the result set to a file: each word and its count is converted to
 * bytes by format(word, times) and written in the result set's current order.
 * An I/O error is printed to standard output rather than thrown.
 * @param filepath the path of the file to write.
 */
public void saveResult(String filepath)
{
    try{
        FileOutputStream write = new FileOutputStream(filepath);
        try{
            for(int i = 0;i<this.resultSet.getCount();i++)
            {
                String word = this.resultSet.getWordAt(i);
                int times = this.resultSet.getFrequency(i);
                write.write(this.format(word,times));
            }
        }finally{
            // Close the file even when a write fails part-way through.
            write.close();
        }
    }catch(IOException e)
    {
        System.out.println(e);
    }
}
/**@author ZhuTao
* This method is designed to format information
* to byte stream.
* @param word
* @param times
* @return a byte stream.
*/
public byte[] format(String word,int times)
{
St