1. Data Preparation
200,000 (20W) manually labeled text records. Each line holds a category label and a product title joined by the separator #k-v#; sample rows:
1#k-v#*亮亮爱宠*波波宠物指甲钳指甲剪附送锉刀适用小型犬及猫特价
1#k-v#*顺丰包邮*宠物药品圣马利诺PowerIgG免疫力球蛋白犬猫细小病毒
1#k-v#*包邮*法国罗斯蔓草本精华宠物浴液薰衣草护色润泽香波拍套餐
1#k-v#*包邮*家朵102宠物沐浴液
1#k-v#*包邮*家朵102宠物沐浴液猫
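Each record can be split on the literal #k-v# separator to recover the label and the title. A minimal sketch (the variable names here are illustrative, not part of the original code):

String line = "1#k-v#*包邮*家朵102宠物沐浴液";
String[] fields = line.split("#k-v#"); // "#k-v#" contains no regex metacharacters, so split is safe
String label = fields[0];  // category label, e.g. "1"
String title = fields[1];  // raw product title, segmented in the next step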
2. Word Segmentation
Use the ansj package to segment the text data and filter out stopwords. The code is as follows:
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
public class Seg {

    // Stopword set, loaded once when the class is initialized.
    private static Set<String> stopwords = new HashSet<String>();

    static {
        File f = new File(""); // path to the stopword file (left blank in the original)
        try {
            List<String> lines = FileUtils.readLines(f, "UTF-8");
            for (String str : lines) {
                stopwords.add(str.trim());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws IOException {
        File f = new File("");          // labeled input file
        File resultFile = new File(""); // segmented output file
        List<String> lists = FileUtils.readLines(f, "UTF-8");
        int count = 0;
        for (String str : lists) {
            count++;
            String[] parts = str.split("#k-v#");
            if (parts.length < 2) {
                continue; // skip malformed lines instead of throwing
            }
            String index = parts[0];
            // System.out.println(count + " " + Integer.parseInt(index));
            // Segment the title with ansj's standard analyzer.
            Result res = ToAnalysis.parse(parts[1]);
            List<Term> terms = res.getTerms();
            String wordStr = "";
            for (Term t : terms) {
                String word = t.getName();
                // Keep only tokens longer than one character that are not stopwords.
                if (word.length() > 1 && !stopwords.contains(word)) {
                    wordStr = wordStr + " " + word;
                }
            }
            if (!StringUtils.isEmpty(wordStr)) {
                FileUtils.write(resultFile, index + "#k-v#" + wordStr + "\n", "UTF-8", true);
            }
        }
    }
}
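After this step, each output line keeps the original label followed by the space-separated tokens, in the form label#k-v# token1 token2 .... Note that only tokens longer than one character and absent from the stopword list survive the filter, so a title consisting entirely of stopwords or single characters produces an empty token string and is dropped by the isEmpty check.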