转载:https://www.cnblogs.com/JustIsQiGe/p/8006734.html
在Spark中使用ansj分词,先要将ansj_seg-5.1.1.jar和nlp-lang-1.7.2.jar加入工程。
ansj源码github:https://github.com/NLPchina/ansj_seg
ansj下载链接:https://oss.sonatype.org/content/repositories/releases/org/ansj/ansj_seg/
nlp-lang下载链接:https://oss.sonatype.org/content/repositories/releases/org/nlpcn/nlp-lang/
package com.spark.test

import org.apache.spark.sql.SparkSession
import org.apache.log4j.{Level, Logger}
import scala.io.Source
import scala.util.Try
import org.ansj.splitWord.analysis.DicAnalysis
import org.ansj.library.DicLibrary
import org.ansj.recognition.impl.StopRecognition
import org.nlpcn.commons.lang.tire.library
import java.util.Arrays

/**
 * Reads tab-separated review records, relabels their score column, and writes
 * the ansj word segmentation of each record's `text` column to a text file.
 *
 * Usage: `Participle [inputPath [outputPath]]`. Both arguments are optional and
 * fall back to the paths that were previously hard-coded.
 */
object Participle {

  /** One review record; every column is kept as the raw string read from the input. */
  case class Movies(productId: String, userId: String, profileName: String,
                    helpfulness: String, score: String, time: String,
                    summary: String, text: String)

  private val DefaultInputPath = "/Users/yangyang/Desktop/b.txt"
  private val DefaultOutputPath = "/Users/XXXXX/Desktop/c"

  /** Number of leading tab-separated columns that make up a [[Movies]] record. */
  private val FieldCount = 8

  /** Position of the score column within a record. */
  private val ScoreIndex = 4

  /**
   * Builds the filter applied to the segmentation result.
   *
   * NOTE(review): "null", "<br />", ":" and "'" are registered as *natures*
   * (part-of-speech tags), exactly as before. They look like they were meant
   * to be stop words -- confirm against the ansj StopRecognition API before
   * changing them.
   */
  private def buildStopRecognition(): StopRecognition = {
    val stop = new StopRecognition()
    stop.insertStopNatures("w") // punctuation
    stop.insertStopNatures("m") // nature "m"
    stop.insertStopNatures("null")
    stop.insertStopNatures("<br />")
    stop.insertStopNatures(":")
    stop.insertStopNatures("'")
    stop
  }

  /**
   * Maps a raw score to its training label.
   *
   * The score is compared numerically rather than as a string, so values such
   * as "10.0" are no longer ordered before "3.0".
   *
   * @param score raw content of the score column
   * @return `None` for a score of 4.0 (such records are dropped), `Some("0")`
   *         for scores up to 3.0, `Some("1")` for 5.0, and the score unchanged
   *         for anything else, including values that are not numbers
   */
  private def relabelScore(score: String): Option[String] =
    Try(score.trim.toDouble).toOption match {
      case Some(value) if value == 4.0 => None
      case Some(value) if value <= 3.0 => Some("0")
      case Some(value) if value == 5.0 => Some("1")
      case _                           => Some(score)
    }

  /**
   * Parses one input line into a [[Movies]] record.
   *
   * @param line one tab-separated line of the input file
   * @return the record with its score relabelled, or `None` when the line has
   *         fewer than [[FieldCount]] columns or its score is 4.0
   */
  private def parseMovie(line: String): Option[Movies] = {
    // A limit of -1 keeps trailing empty columns, so a record with an empty
    // last column still yields all of its fields.
    val fields = line.split("\t", -1)
    if (fields.length < FieldCount) None
    else
      relabelScore(fields(ScoreIndex)).map { label =>
        Movies(fields(0), fields(1), fields(2), fields(3), label,
          fields(5), fields(6), fields(7))
      }
  }

  /**
   * Entry point.
   *
   * @param args optional `inputPath` (args(0)) and `outputPath` (args(1))
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val inputPath = if (args.length > 0) args(0) else DefaultInputPath
    val outputPath = if (args.length > 1) args(1) else DefaultOutputPath

    val stop = buildStopRecognition()
    val spark = SparkSession.builder().master("local[4]").appName("prepare").getOrCreate()
    try {
      import spark.implicits._

      // Build the training data set; the 4.0 filter looks at the score column
      // only, so a "4.0" inside the review text no longer drops the record.
      val trains = spark.sparkContext
        .textFile(inputPath)
        .flatMap(line => parseMovie(line))
        .toDF()
      trains.createOrReplaceTempView("train")

      // Example of the segmentation call on a single sentence:
      //   DicAnalysis.parse("好喜欢《武林外传》这部电视剧!").recognition(stop).toStringWithOutNature("|")
      spark.sql("select text from train").rdd
        .map { row =>
          // Read the column value itself; Row.toString would wrap it in brackets.
          DicAnalysis.parse(row.getString(0)).recognition(stop).toStringWithOutNature("|")
        }
        .saveAsTextFile(outputPath)
    } finally {
      // Release the session even when reading, segmenting or writing fails.
      spark.close()
    }
  }
}
部分参考:http://m.blog.youkuaiyun.com/ozinco/article/details/70184347