/*
*michzel new java files
*
*Created on 2010-10-2
*
*Copyright 2010 Anchora info company. all rights reserved
*/
package TFIDF;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;
/**
 * Command-line demo: reads a text file, splits it into words with
 * {@link IKSegmentation}, counts how often each distinct word occurs, prints
 * the counts and term frequencies, and writes the counts to a result file.
 */
public class IKtf
{
    /** Line terminator written after every "word:count" entry in the result file. */
    private static final String LINE_END = "\r\n";

    /**
     * Runs the segmentation and counting pipeline on a fixed input file located
     * in the current working directory ({@code user.dir}).
     *
     * <p>For every distinct word, "word:count" and the term frequency
     * (count divided by the total number of segmented words) are printed to
     * standard output; the "word:count" lines are also written to
     * {@code resulttest.txt} in the working directory.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        String filepathfrom = System.getProperty("user.dir") + "//南宋生活顾问1.txt";
        String filepathto = System.getProperty("user.dir") + "//resulttest.txt";

        String text = TextManager.Read(filepathfrom);
        if (text == null)
        {
            // StringReader rejects a null source with a NullPointerException;
            // report the problem instead. NOTE(review): whether TextManager.Read
            // can actually return null is not visible here - confirm.
            System.err.println("Could not read input file: " + filepathfrom);
            return;
        }

        // Word segmentation.
        System.out.println(text);
        List<String> wordsList = segment(text);
        System.out.println("***************");

        // Word frequency counting.
        List<WordsCounter> wordsCountList = countWords(wordsList);

        // Report the counts and collect them for the result file.
        StringBuilder resultString = new StringBuilder();
        for (WordsCounter wordCounter : wordsCountList)
        {
            resultString.append(wordCounter.text)
                        .append(':')
                        .append(wordCounter.count)
                        .append(LINE_END);
            System.out.println(wordCounter.text + ":" + wordCounter.count);
            // The cast forces floating-point division; wordsList is non-empty
            // whenever this loop body runs.
            double tf = (double) wordCounter.count / wordsList.size();
            System.out.println(tf);
        }
        TextManager.Write(filepathto, resultString.toString());
    }

    /**
     * Splits {@code text} into words, echoing each lexeme and the running word
     * total to standard output.
     *
     * <p>If the segmenter throws an {@link IOException}, the stack trace is
     * printed and the words collected so far are returned.
     *
     * @param text the non-null text to segment
     * @return the segmented words in order of appearance
     */
    private static List<String> segment(String text)
    {
        List<String> words = new ArrayList<String>();
        // NOTE(review): the boolean presumably selects IK's segmentation
        // granularity (isMaxWordLength in IK Analyzer 3.x) - confirm against
        // the library version in use.
        IKSegmentation ikSeg = new IKSegmentation(new StringReader(text), false);
        try
        {
            Lexeme lexeme;
            while ((lexeme = ikSeg.next()) != null)
            {
                System.out.println(lexeme);
                words.add(lexeme.getLexemeText());
                System.out.println(words.size());
            }
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        return words;
    }

    /**
     * Counts the occurrences of each distinct word.
     *
     * <p>The returned list keeps first-occurrence order; a hash index is used
     * for lookups so each word is processed without rescanning the list.
     *
     * @param words the words to count
     * @return one counter per distinct word, in order of first appearance
     */
    private static List<WordsCounter> countWords(List<String> words)
    {
        List<WordsCounter> counters = new ArrayList<WordsCounter>();
        Map<String, WordsCounter> counterByWord = new HashMap<String, WordsCounter>();
        for (String word : words)
        {
            WordsCounter counter = counterByWord.get(word);
            if (counter == null)
            {
                counter = new WordsCounter(word, 1);
                counters.add(counter);
                counterByWord.put(word, counter);
            }
            else
            {
                counter.count++;
            }
        }
        return counters;
    }
}
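// Term frequency (TF): a word's occurrence count divided by the total number of words in the text.
// (Web-page residue: "latest recommended article published on 2024-07-15 22:53:43".)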