package stastic;
import java.util.*;
import java.util.regex.*;
import java.io.*;
/**
 * Reports the most frequent words of an English article.
 *
 * <p>The article is read from {@code f://article.txt}. Words are runs of ASCII
 * letters, compared case-insensitively; the articles "a", "an" and "the" are
 * not counted. The report is printed to standard output and also saved to
 * {@code f://result.txt}.
 *
 * <p>Created on 2008-1-19.
 *
 * @author wangwei
 */
public class StatisticsWord {

    /** Location of the article to analyse (the article is assumed to be on drive F). */
    private static final String ARTICLE_PATH = "f://article.txt";

    /** Location the report is saved to. */
    private static final String RESULT_PATH = "f://result.txt";

    /** How many of the most frequent words are reported. */
    private static final int TOP_WORDS = 3;

    /** A word is a maximal run of ASCII letters. */
    private static final Pattern WORD = Pattern.compile("[a-zA-Z]+");

    /** Lower-cased words that are excluded from the statistics. */
    private static final Set<String> IGNORED_WORDS =
            new HashSet<String>(Arrays.asList("a", "an", "the"));

    /**
     * Reads the article, counts its words and writes the report of the most
     * frequent ones to standard output and to the result file.
     *
     * @param args ignored
     * @throws IOException if the article cannot be read or the result file
     *                     cannot be written
     */
    public static void main(String args[]) throws IOException {
        String article;
        // FileReader/FileWriter keep using the platform default charset, as before.
        BufferedReader in = new BufferedReader(new FileReader(ARTICLE_PATH));
        try {
            article = readText(in);
        } finally {
            in.close(); // release the file even when reading fails
        }

        List<String> report = formatReport(topWords(countWords(article), TOP_WORDS));

        BufferedWriter out = new BufferedWriter(new FileWriter(RESULT_PATH));
        try {
            for (String reportLine : report) {
                System.out.println(reportLine);
                out.write(reportLine);
                out.newLine();
            }
        } finally {
            out.close(); // flushes the report and releases the file
        }
    }

    /**
     * Reads everything from {@code source}, one line at a time.
     *
     * <p>Every line is terminated with {@code '\n'} in the result so that the
     * last word of one line never fuses with the first word of the next.
     * The caller remains responsible for closing {@code source}.
     * Package-private so it can be tested without touching the file system.
     *
     * @param source where the text comes from
     * @return the whole text with normalised line terminators
     * @throws IOException if reading fails
     */
    static String readText(Reader source) throws IOException {
        BufferedReader lines = (source instanceof BufferedReader)
                ? (BufferedReader) source
                : new BufferedReader(source);
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = lines.readLine()) != null) {
            text.append(line).append('\n');
        }
        return text.toString();
    }

    /**
     * Counts how often each word occurs in {@code text}.
     *
     * <p>The text is lower-cased (locale-independently) before matching, and
     * the words in {@link #IGNORED_WORDS} are skipped.
     *
     * @param text the text to analyse
     * @return a map from lower-cased word to its number of occurrences,
     *         sorted alphabetically by word
     */
    static Map<String, Integer> countWords(String text) {
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        // Locale.ROOT: the default locale must not influence ASCII case mapping
        // (e.g. Turkish maps 'I' to a dotless 'i' that the pattern would not match).
        Matcher matcher = WORD.matcher(text.toLowerCase(Locale.ROOT));
        while (matcher.find()) {
            String word = matcher.group();
            if (IGNORED_WORDS.contains(word)) {
                continue;
            }
            Integer seen = counts.get(word);
            counts.put(word, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
        }
        return counts;
    }

    /**
     * Selects the most frequent words.
     *
     * <p>Words and counts are sorted together as pairs, so a word can never be
     * reported with another word's count. Words with equal counts are ordered
     * alphabetically to keep the result deterministic.
     *
     * @param counts word frequencies, e.g. from {@link #countWords(String)}
     * @param limit  maximum number of words to return; values below zero are
     *               treated as zero
     * @return at most {@code limit} entries, most frequent first; fewer when
     *         {@code counts} holds fewer distinct words
     */
    static List<Map.Entry<String, Integer>> topWords(Map<String, Integer> counts, int limit) {
        List<Map.Entry<String, Integer>> ranked =
                new ArrayList<Map.Entry<String, Integer>>(counts.size());
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            // Snapshot each entry so the result does not depend on the map afterwards.
            ranked.add(new AbstractMap.SimpleImmutableEntry<String, Integer>(entry));
        }
        Collections.sort(ranked, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> left,
                               Map.Entry<String, Integer> right) {
                int byCount = right.getValue().compareTo(left.getValue()); // descending
                return byCount != 0 ? byCount : left.getKey().compareTo(right.getKey());
            }
        });
        int size = Math.min(Math.max(limit, 0), ranked.size());
        return new ArrayList<Map.Entry<String, Integer>>(ranked.subList(0, size));
    }

    /**
     * Builds the lines of the report: a heading followed by one line per word.
     *
     * @param top the words to report, in the order they should appear
     * @return the report lines without line terminators
     */
    static List<String> formatReport(List<Map.Entry<String, Integer>> top) {
        List<String> lines = new ArrayList<String>(top.size() + 1);
        lines.add("该文章频率最高的三个单词为:");
        for (Map.Entry<String, Integer> entry : top) {
            lines.add(entry.getKey() + " 单词个数为:" + entry.getValue());
        }
        return lines;
    }
}