如果不使用 NLP 的 API,Java 中 String 类的 split 方法和 StringTokenizer 类也可以进行简单的分词。下文的例子均使用 Apache OpenNLP 的方法,案例来源于《Java自然语言处理》这本书。
import java.util.Locale;

import opennlp.tools.tokenize.SimpleTokenizer;
/**
* Filename: NlpTokenizerDeal.java
* Description: 文本分词标准化处理 - 1.字母转为小写;2.缩写词展开;3.去除停用词;4.词干化和词形还原
* Copyright: Copyright (c) 2019 All Rights Reserved.
* @author: wangk
* @version: 1.0
* Create at: 2019年5月6日 上午11:26:53
*
* Modification History:
* Date Author Version Description
* ------------------------------------------------------------------
* 2019年5月6日 wangk 1.0 1.0 Version
*
*/
public class NlpTokenizerDeal {

    /** Sample English sentence used to demonstrate stop-word removal. */
    static String paragraph = "A simple approach to create a class to hold and remove stopwords Let's IBM";

    /**
     * Sample Chinese sentence. Chinese has no word delimiters, so a crude
     * "tokenization" can be done by inserting a space between characters
     * with a regex (see {@link #stopWordC(String)}).
     */
    static String chineseLanguage = "第一个括号子表达式捕获 Web 地址的协议部分。该子表达式匹配在冒号和两个正斜杠前面的任何单词。";

    /** Sample words that share the stem "bank". */
    static String[] words = {"bank", "banking", "banks", "banker"};

    public static void main(String[] args) {
        NlpTokenizerDeal to = new NlpTokenizerDeal();
        //to.stopWordC(chineseLanguage);
        to.porterStemmer(words);
    }

    /**
     * Lower-cases the given text and prints the result.
     * <p>
     * Uses {@link Locale#ROOT} so the conversion does not depend on the JVM's
     * default locale (e.g. under the Turkish locale {@code "I"} would become
     * a dotless {@code "ı"}, breaking later stop-word matching).
     *
     * @param text text to normalize
     */
    public void toLowerCase(String text) {
        String ret = text.toLowerCase(Locale.ROOT);
        System.out.println(ret);
    }

    /**
     * Removes English stop words from the text and prints the surviving tokens,
     * one per line.
     *
     * @param text English text to filter
     */
    public void stopWordE(String text) {
        StopWords stopWords = new StopWords();
        // The stop-word list is lower-case, so normalize the input first
        // (locale-independent, for the reason documented on toLowerCase).
        text = text.toLowerCase(Locale.ROOT);
        SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = simpleTokenizer.tokenize(text);
        String[] filtered = stopWords.removeEWords(tokens);
        for (String word : filtered) {
            System.out.println(word);
        }
    }
    // Output: simple approach create class hold and remove stopwords let ' s ibm

    /**
     * Removes Chinese stop words from the text and prints the surviving tokens,
     * one per line.
     *
     * @param text Chinese text to filter
     */
    public void stopWordC(String text) {
        StopWords stopWords = new StopWords();
        // SimpleTokenizer splits on whitespace, so insert a space after every
        // character to give the tokenizer something to split on.
        String regex = "(.{1})";
        text = text.replaceAll(regex, "$1 ");
        SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = simpleTokenizer.tokenize(text);
        String[] filtered = stopWords.removeCWords(tokens);
        for (String word : filtered) {
            System.out.println(word);
        }
    }
    // Output: 第个括号子表达式捕获Web地址协议部分。子表达式匹配冒号两正斜杠前面任单词。

    /**
     * Prints the Porter stem of each word ("word:... stem:..." per line).
     * Stemming strips regular suffixes only; it does not handle irregular
     * forms (lemmatization would be needed for those).
     * <p>
     * Porter stemmer reference: https://tartarus.org/martin/PorterStemmer/
     *
     * @param words words to stem
     */
    public void porterStemmer(String[] words) {
        PorterStemmer ps = new PorterStemmer();
        for (String word : words) {
            String stem = ps.stem(word);
            System.out.println("word:" + word + " stem:" + stem);
        }
    }
    /* Output:
       word:bank stem:bank
       word:banking stem:bank
       word:banks stem:bank
       word:banker stem:banker */
}
下面是文中用到的两个工具类:停用词类(StopWords)和词干化还原类(PorterStemmer)。
package com.npl.demo.utils;
/**
*
* Stemmer, implementing the Porter Stemming Algorithm
*
* The Stemmer class transforms a word into its root form. The input
* word can be provided a character at time (by calling add()), or at once
* by calling one of the various stem(something) methods.
*/
public class PorterStemmer {
private char[] b;
private int i, /* offset into b */
j, k, k0;
private boolean dirty = false;
pr

最低0.47元/天 解锁文章
1016

被折叠的 条评论
为什么被折叠?



