+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++cp1
//Code examples from the book
https://github.com/sryza/aas
//spark-docs
http://spark.apache.org/docs/
//IDEA / Scala / Spark version compatibility issues
https://blog.youkuaiyun.com/ningyanggege/article/details/86526984
//Spark SQL, DataFrames and Datasets programming guide
http://ifeve.com/spark-sql-dataframes/?spm=a2c4e.11153940.blogcont336314.10.7f772e71J4g83O
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++cp2 Analyzing data with Scala and Spark
//2.4 The Spark shell and SparkContext
sudo mkdir -p sparkS/linkage
sudo chown -R zieox:zieox sparkS
curl -L -o donation.zip https://bit.ly/1Aoywaq
unzip donation.zip
unzip 'block_*.zip'
//Store the data in HDFS
hadoop fs -mkdir linkage
hadoop fs -put block_*.csv /user/zieox/linkage
//Launch Spark on the cluster
spark-shell --master yarn --deploy-mode client
//Launch Spark locally, using as many worker threads as there are cores
spark-shell --master local[*]
---------------------------------
WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
https://www.cnblogs.com/tijun/p/7562282.html
WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
http://www.bubuko.com/infodetail-2022126.html
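//A possible fix (a sketch for this environment; the paths below are assumptions): point Spark at the native Hadoop libraries and stage the Spark jars on HDFS so YARN stops re-uploading them
//in $SPARK_HOME/conf/spark-env.sh:
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH
//stage the jars once, then reference them via spark.yarn.jars in spark-defaults.conf:
hadoop fs -mkdir -p /spark-jars
hadoop fs -put $SPARK_HOME/jars/*.jar /spark-jars/
//spark-defaults.conf:  spark.yarn.jars  hdfs:///spark-jars/*.jar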
---------------------------------
//Create a simple RDD
val rdd = sc.parallelize(Array(1,2,2,4),4)
//Read data
val rdd2 = sc.textFile("hdfs:///user/hduser/test/README.txt")
val rdd3 = sc.textFile("hdfs:///user/zieox/linkage")
rdd3.first
val head = rdd3.take(10)
//Actions
rdd3.count()
rdd3.collect() //returns an Array of all the objects in the RDD
rdd3.saveAsTextFile("hdfs:///user/zieox/results") //save the RDD contents to HDFS (persistent storage)
def isHeader(line:String)=line.contains("id_1")
head.filter(isHeader).foreach(println)
def i(a:String,b:String)=a.contains(b)
scala> head.filter(a=>i(a,"39086")).foreach(println) --->39086,47614,1,?,1,?,1,1,1,1,1,TRUE
//filterNot examples
head.filterNot(isHeader).length
head.filter(x=> !isHeader(x)).length
head.filter(!isHeader(_)).length
//2.6 Shipping code from the client to the cluster
val noh = head.filter(!isHeader(_))
//2.7 From RDD to DataFrame
spark.sparkContext
scala> val prev = spark.read.csv("linkage")
prev: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 10 more fields]
//Using the CSV reader
https://github.com/databricks/spark-csv#features
val parsed = spark.read.option("header","true").option("nullValue","?").option("inferSchema","true").csv("linkage")
parsed.printSchema
//Show the first few rows of the DataFrame
parsed.show
//DataFrames and data formats
json
parquet & orc
jdbc
libsvm
text
//Example: load data from a JSON file
val d1 = spark.read.format("json").load("file.json")
val d2 = spark.read.json("file.json")
//Example: save the data in Parquet format
d1.write.format("parquet").save("file.parquet")
d1.write.parquet("file.parquet")
//By default, Spark reports an error if the target path already exists; instead we can choose Overwrite (replace it), Append (add to the end), or Ignore (skip the write)
d2.write.mode(SaveMode.Ignore).parquet("file.parquet")
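//For completeness, a small sketch of the other save modes (SaveMode needs an import; the path is the same placeholder as above):
import org.apache.spark.sql.SaveMode
d2.write.mode(SaveMode.Overwrite).parquet("file.parquet")
d2.write.mode(SaveMode.Append).parquet("file.parquet")
d2.write.mode("ignore").parquet("file.parquet") //mode() also accepts the strings "overwrite", "append", "ignore", "error"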
//2.8 Analyzing data with the DataFrame API
scala> parsed.count --->res41: Long = 5749132
//cache: without it, Spark re-reads and re-parses the data every time we run another action, even if we have already filtered the dataset or aggregated the original data. That wastes cluster resources, so once the data has been parsed we keep the parsed result around on the cluster by using cache
parsed.cache()
//Caching: cache() is shorthand for persist() with the default storage level; it keeps the Row objects as unserialized Java objects
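//If the default storage level is not what we want, persist() also takes an explicit StorageLevel (a small sketch, not from the book):
import org.apache.spark.storage.StorageLevel
parsed.unpersist() //drop the cached copy created above
parsed.persist(StorageLevel.MEMORY_AND_DISK_SER) //keep a serialized copy in memory, spilling to disk when memory is tight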
parsed.rdd.map(_.getAs[Boolean]("is_match")).countByValue()
//Using the DataFrame API
//SQL equivalent: select is_match, count(*) as count from linkage group by is_match order by count desc
parsed.groupBy("is_match").count().orderBy($"count".desc).show()
//parsed.groupBy("cmp_plz").sum().show()
//DataFrame aggregation functions
parsed.agg(avg($"cmp_sex"),stddev($"cmp_sex")).show()
//Create a temporary view and query it with Spark SQL
parsed.createOrReplaceTempView("linkage")
spark.sql("""select is_match,count(*) as count from linkage group by is_match order by count desc""")
//Connecting Spark SQL to Hive
import org.apache.spark.sql.SparkSession
val sparkSession = SparkSession.builder.master("local[4]").enableHiveSupport().getOrCreate()
//2.9 Summary statistics for DataFrames
val summary = parsed.describe()
summary.select("id_1","summary","cmp_lname_c1").show()
val matches = parsed.where("is_match=true")
val matchSummary=matches.describe()
val misses = parsed.filter($"is_match" === false)
val missSummary = misses.describe()
//2.10 Pivoting and reshaping DataFrames
summary.printSchema()
val schema = summary.schema
//Long format to wide format
val longForm=summary.flatMap(row=>{
val metric = row.getString(0)
(1 until row.size).map(i =>{(metric,schema(i).name,row.getString(i).toDouble)})
})
val longDF = longForm.toDF("metric","field","value")
longDF.show()
val wideDF = longDF.groupBy("field").pivot("metric",Seq("count","mean","stddev","min","max")).agg(first("value"))
wideDF.select("field","count","mean").show()
// Wrap the reshaping logic into a reusable function
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.first
def pivotSummary(desc:DataFrame):DataFrame={
val schema = desc.schema
import desc.sparkSession.implicits._
val lf = desc.flatMap( row => {
val metric = row.getString(0)
(1 until row.size).map(i=>{(metric,schema(i).name,row.getString(i).toDouble)})
}).toDF("metric","field","value")
lf.groupBy("field").pivot("metric",Seq("count","mean","stddev","min","max")).agg(first("value"))
}
val matchSummaryT = pivotSummary(matchSummary)
val missSummaryT = pivotSummary(missSummary)
//2.11 Joining DataFrames and selecting features
matchSummaryT.createOrReplaceTempView("match_desc")
missSummaryT.createOrReplaceTempView("miss_desc")
spark.sql("""select a.field,a.count+b.count total,a.mean-b.mean delta from match_desc a inner join miss_desc b on a.field=b.field where a.field not in ("id_1","id_2") order by delta desc,total desc""").show()
//2.12 Preparing models for production environments
case class MatchData(
id_1: Int,
id_2: Int,
cmp_fname_c1: Option[Double],
cmp_fname_c2: Option[Double],
cmp_lname_c1: Option[Double],
cmp_lname_c2: Option[Double],
cmp_sex: Option[Int],
cmp_bd: Option[Int],
cmp_bm: Option[Int],
cmp_by: Option[Int],
cmp_plz: Option[Int],
is_match: Boolean
)
//Convert parsed into a Dataset[MatchData]
val matchData = parsed.as[MatchData]
matchData.show()
case class Score(value:Double){def +(oi:Option[Int]) ={Score(value + oi.getOrElse(0))}}
def scoreMatchData(md:MatchData):Double = {(Score(md.cmp_lname_c1.getOrElse(0.0))+md.cmp_plz+md.cmp_by+md.cmp_bd+md.cmp_bm).value}
val scored = matchData.map{md => (scoreMatchData(md),md.is_match)}.toDF("score","is_match")
//2.13 Evaluating the model
def crossTabs(scored: DataFrame, t: Double): DataFrame = {
scored.
selectExpr(s"score >= $t as above", "is_match").
groupBy("above").
pivot("is_match", Seq("true", "false")).
count()
}
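//A quick check of the function above (4.0 is the threshold used in the full program below):
crossTabs(scored, 4.0).show()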
=====================================================================================================================
package com.cloudera.datascience.intro
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._ // for lit(), first(), etc.
case class MatchData(
id_1: Int,
id_2: Int,
cmp_fname_c1: Option[Double],
cmp_fname_c2: Option[Double],
cmp_lname_c1: Option[Double],
cmp_lname_c2: Option[Double],
cmp_sex: Option[Int],
cmp_bd: Option[Int],
cmp_bm: Option[Int],
cmp_by: Option[Int],
cmp_plz: Option[Int],
is_match: Boolean
)
object RunIntro extends Serializable {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("Intro").getOrCreate
import spark.implicits._
val preview = spark.read.csv("hdfs:///user/ds/linkage")
preview.show()
preview.printSchema()
val parsed = spark.read
.option("header", "true")
.option("nullValue", "?")
.option("inferSchema", "true")
.csv("hdfs:///user/ds/linkage")
parsed.show()
parsed.printSchema()
parsed.count()
parsed.cache()
parsed.groupBy("is_match").count().orderBy($"count".desc).show()
parsed.createOrReplaceTempView("linkage")
spark.sql("""SELECT is_match, COUNT(*) cnt FROM linkage GROUP BY is_match ORDER BY cnt DESC""").show()
val summary = parsed.describe()
summary.show()
summary.select("summary", "cmp_fname_c1", "cmp_fname_c2").show()
val matches = parsed.where("is_match = true")
val misses = parsed.filter($"is_match" === false)
val matchSummary = matches.describe()
val missSummary = misses.describe()
val matchSummaryT = pivotSummary(matchSummary)
val missSummaryT = pivotSummary(missSummary)
matchSummaryT.createOrReplaceTempView("match_desc")
missSummaryT.createOrReplaceTempView("miss_desc")
spark.sql("""SELECT a.field, a.count + b.count total, a.mean - b.mean delta FROM match_desc a INNER JOIN miss_desc b ON a.field = b.field ORDER BY delta DESC, total DESC""").show()
val matchData = parsed.as[MatchData]
val scored = matchData.map { md => (scoreMatchData(md), md.is_match) }.toDF("score", "is_match")
crossTabs(scored, 4.0).show()
}
def crossTabs(scored: DataFrame, t: Double): DataFrame = {
scored.
selectExpr(s"score >= $t as above", "is_match").
groupBy("above").
pivot("is_match", Seq("true", "false")).
count()
}
case class Score(value: Double) {def +(oi: Option[Int]) = {Score(value + oi.getOrElse(0))}}
def scoreMatchData(md: MatchData): Double = {(Score(md.cmp_lname_c1.getOrElse(0.0)) + md.cmp_plz +md.cmp_by + md.cmp_bd + md.cmp_bm).value}
def pivotSummary(desc: DataFrame): DataFrame = {
val lf = longForm(desc)
lf.groupBy("field").pivot("metric", Seq("count", "mean", "stddev", "min", "max")).agg(first("value"))
}
def longForm(desc: DataFrame): DataFrame = {
import desc.sparkSession.implicits._ // For toDF RDD -> DataFrame conversion
val columns = desc.schema.map(_.name)
desc.flatMap(row => {
val metric = row.getAs[String](columns.head)
columns.tail.map(columnName => (metric, columnName, row.getAs[String](columnName).toDouble))
}).toDF("metric", "field", "value")
}
}
=====================================================================================================================
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++cp3 Music recommendations and the Audioscrobbler dataset
//blog
https://www.cnblogs.com/mr-totoro/p/5775759.html
//3.1 The data set
https://www.etud.iro.umontreal.ca/~bergstrj/audioscrobbler_data.html
wget http://www.iro.umontreal.ca/~lisa/datasets/profiledata_06-May-2005.tar.gz
//3.2 The alternating least squares recommender algorithm
https://en.wikipedia.org/wiki/Collaborative_filtering
//3.3 Preparing the data
hadoop fs -put user_artist_data.txt /user/zieox/
//Read the data
val rawUserArtistData = sc.textFile("user_artist_data.txt")
rawUserArtistData.take(5).foreach(println)
//Consolidate the data into a DataFrame
val userArtistDF = rawUserArtistData.map{ line =>
val Array(user,artist,_*) = line.split(' ')
(user.toInt,artist.toInt)
}.toDF("user","artist")
userArtistDF.agg(min("user"),max("user"),min("artist"),max("artist")).show()
val rawArtistData = sc.textFile("artist_data.txt")
val artistByID = rawArtistData.flatMap { line =>
val (id, name) = line.span(_ != '\t') ; if (name.isEmpty) {None} else {try {Some((id.toInt, name.trim))} catch {case _ : NumberFormatException => None}}
}.toDF("id","name")
val rawArtistAlias = spark.read.textFile("artist_alias.txt")
val artistAlias = rawArtistAlias.flatMap{
line => val Array(artist,alias) = line.split('\t') ; if (artist.isEmpty){None} else {Some((artist.toInt,alias.toInt))}
}.collect().toMap
artistAlias.head
artistByID.filter($"id" isin (1208690,1003926)).show()
//3.4 Building a first model
import org.apache.spark.sql._
import org.apache.spark.broadcast._
def buildCounts(rawUserArtistData: org.apache.spark.rdd.RDD[String] /*Dataset[String]*/,bArtistAlias: Broadcast[Map[Int,Int]]): DataFrame = {
rawUserArtistData.map { line =>
val Array(userID, artistID, count) = line.split(' ').map(_.toInt)
val finalArtistID = bArtistAlias.value.getOrElse(artistID, artistID)
(userID, finalArtistID, count)
}.toDF("user", "artist", "count")
}
val bArtistAlias = spark.sparkContext.broadcast(artistAlias)
val trainData = buildCounts(rawUserArtistData,bArtistAlias)
trainData.cache
//Broadcast variables
val dict : Seq[String] = ...
val bDict = spark.sparkContext.broadcast(dict)
def query(path:String) = { spark.read.textFile(path).map(score(_,bDict.value)) ...}
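//A minimal self-contained broadcast sketch (the alias map below is made up for illustration):
val aliasSample = Map(6803336 -> 1000010, 1000010 -> 1000010)
val bAliasSample = spark.sparkContext.broadcast(aliasSample)
val canonicalIDs = sc.parallelize(Seq(6803336, 1000010, 42)).map(id => bAliasSample.value.getOrElse(id, id))
canonicalIDs.collect() //each executor receives one copy of the map instead of one copy per task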
import org.apache.spark.ml.recommendation._
import scala.util.Random
val model = new org.apache.spark.ml.recommendation.ALS().
setSeed(Random.nextLong()).
setImplicitPrefs(true).
setRank(10).
setRegParam(0.01).
setAlpha(1.0).
setMaxIter(5).
setUserCol("user").
setItemCol("artist").
setRatingCol("count").
setPredictionCol("prediction").
fit(trainData)
model.userFactors.show(1)
//3.5 Spot checking recommendations
val userID = 2093760
val existingArtistIDs = trainData.filter($"user"===userID).select("artist").as[Int].collect()
artistByID.filter($"id" isin (excitingArtistIDs:_*)).show()
//Recommendation function
def makeRecommendations(model:ALSModel,userID:Int,howMany:Int):DataFrame={
val toRecommend=model.itemFactors.select($"id".as("artist")).withColumn("user",lit(userID))
model.transform(toRecommend).select("artist","prediction").orderBy($"prediction".desc).limit(howMany)
}
//Make recommendations
val topRecommendations = makeRecommendations(model,userID,5)
topRecommendations.show()
val recommendedArtistIDs = topRecommendations.select("artist").as[Int].collect()
artistByID.filter($"id" isin (recommendedArtistIDs:_*)).show()
//3.6 Evaluating recommendation quality
//3.7 Computing AUC
def areaUnderCurve(positiveData: DataFrame,bAllArtistIDs: Broadcast[Array[Int]],predictFunction: (DataFrame => DataFrame)): Double = { ... }
val allData = buildCounts(rawUserArtistData,bArtistAlias)
val Array(trainData,cvData) = allData.randomSplit(Array(0.9,0.1))
trainData.cache
cvData.cache
val allArtistIDs = allData.select("artist").as[Int].distinct().collect()
val bAllArtistIDs = spark.sparkContext.broadcast(allArtistIDs)
val model = new ALS().
setSeed(Random.nextLong()).
setImplicitPrefs(true).
setRank(rank).setRegParam(regParam).
setAlpha(alpha).setMaxIter(20).
setUserCol("user").setItemCol("artist").
setRatingCol("count").setPredictionCol("prediction").
fit(trainData)
areaUnderCurve(cvData,bAllArtistIDs,model.transform)
//Prediction function
def predictMostListened(train: DataFrame)(allData: DataFrame): DataFrame = {
val listenCounts = train.groupBy("artist").agg(sum("count").as("prediction")).select("artist", "prediction")
allData.join(listenCounts, Seq("artist"), "left_outer").select("user", "artist", "prediction")
}
//3.8 Selecting hyperparameter values
setRank(10)       the number of latent factors in the model (roughly, the rank of the factor matrices)
setMaxIter(5)     the number of iterations the matrix factorization runs; more iterations take longer but may produce a better factorization
setRegParam(0.01) a standard overfitting parameter, usually also called lambda; higher values resist overfitting, but values that are too high hurt the factorization's accuracy
setAlpha(1.0)     controls the relative weight of observed versus unobserved user-product interactions in the factorization
val evaluations =
for (rank <- Seq(5, 30);regParam <- Seq(1.0, 0.0001);alpha <- Seq(1.0, 40.0))
yield {
val model = new ALS().
setSeed(Random.nextLong()).
setImplicitPrefs(true).
setRank(rank).setRegParam(regParam).
setAlpha(alpha).setMaxIter(20).
setUserCol("user").setItemCol("artist").
setRatingCol("count").setPredictionCol("prediction").
fit(trainData)
val auc = areaUnderCurve(cvData, bAllArtistIDs, model.transform)
model.userFactors.unpersist()
model.itemFactors.unpersist()
(auc, (rank, regParam, alpha))
}
evaluations.sorted.reverse.foreach(println)
//3.9 Making recommendations
val someUsers = allData.select("user").as[Int].distinct().take(100)
val someRecommendations = someUsers.map(userID => (userID,makeRecommendations(model,userID,5)))
someRecommendations.foreach{case (userID,recsDF) =>
val recommendedArtists = recsDF.select("artist").as[Int].collect()
println(s"$user_id -> ${recommendedArtists.mkString(",")}")
}
-----------------------------------------------------------------------------------------------------------------------
package com.cloudera.datascience.recommender
import scala.collection.Map
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.functions._
object RunRecommender {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().getOrCreate()
// Optional, but may help avoid errors due to long lineage
spark.sparkContext.setCheckpointDir("hdfs:///tmp/")
val base = "hdfs:///user/ds/"
val rawUserArtistData = spark.read.textFile(base + "user_artist_data.txt")
val rawArtistData = spark.read.textFile(base + "artist_data.txt")
val rawArtistAlias = spark.read.textFile(base + "artist_alias.txt")
val runRecommender = new RunRecommender(spark)
runRecommender.preparation(rawUserArtistData, rawArtistData, rawArtistAlias)
runRecommender.model(rawUserArtistData, rawArtistData, rawArtistAlias)
runRecommender.evaluate(rawUserArtistData, rawArtistAlias)
runRecommender.recommend(rawUserArtistData, rawArtistData, rawArtistAlias)
}
}
class RunRecommender(private val spark: SparkSession) {
import spark.implicits._
def preparation(
rawUserArtistData: Dataset[String],
rawArtistData: Dataset[String],
rawArtistAlias: Dataset[String]): Unit = {
rawUserArtistData.take(5).foreach(println)
val userArtistDF = rawUserArtistData.map { line =>
val Array(user, artist, _*) = line.split(' ')
(user.toInt, artist.toInt)
}.toDF("user", "artist")
userArtistDF.agg(min("user"), max("user"), min("artist"), max("artist")).show()
val artistByID = buildArtistByID(rawArtistData)
val artistAlias = buildArtistAlias(rawArtistAlias)
val (badID, goodID) = artistAlias.head
artistByID.filter($"id" isin (badID, goodID)).show()
}
def model(
rawUserArtistData: Dataset[String],
rawArtistData: Dataset[String],
rawArtistAlias: Dataset[String]): Unit = {
val bArtistAlias = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))
val trainData = buildCounts(rawUserArtistData, bArtistAlias).cache()
val model = new ALS().
setSeed(Random.nextLong()).
setImplicitPrefs(true).
setRank(10).
setRegParam(0.01).
setAlpha(1.0).
setMaxIter(5).
setUserCol("user").
setItemCol("artist").
setRatingCol("count").
setPredictionCol("prediction").
fit(trainData)
trainData.unpersist()
model.userFactors.select("features").show(truncate = false)
val userID = 2093760
val existingArtistIDs = trainData.filter($"user" === userID).select("artist").as[Int].collect()
val artistByID = buildArtistByID(rawArtistData)
artistByID.filter($"id" isin (existingArtistIDs:_*)).show()
val topRecommendations = makeRecommendations(model, userID, 5)
topRecommendations.show()
val recommendedArtistIDs = topRecommendations.select("artist").as[Int].collect()
artistByID.filter($"id" isin (recommendedArtistIDs:_*)).show()
model.userFactors.unpersist()
model.itemFactors.unpersist()
}
def evaluate(rawUserArtistData: Dataset[String],rawArtistAlias: Dataset[String]): Unit = {
val bArtistAlias = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))
val allData = buildCounts(rawUserArtistData, bArtistAlias)
val Array(trainData, cvData) = allData.randomSplit(Array(0.9, 0.1))
trainData.cache()
cvData.cache()
val allArtistIDs = allData.select("artist").as[Int].distinct().collect()
val bAllArtistIDs = spark.sparkContext.broadcast(allArtistIDs)
val mostListenedAUC = areaUnderCurve(cvData, bAllArtistIDs, predictMostListened(trainData))
println(mostListenedAUC)
val evaluations = for (rank <- Seq(5, 30); regParam <- Seq(1.0, 0.0001); alpha <- Seq(1.0, 40.0))
yield {
val model = new ALS().
setSeed(Random.nextLong()).
setImplicitPrefs(true).
setRank(rank).setRegParam(regParam).
setAlpha(alpha).setMaxIter(20).
setUserCol("user").setItemCol("artist").
setRatingCol("count").setPredictionCol("prediction").
fit(trainData)
val auc = areaUnderCurve(cvData, bAllArtistIDs, model.transform)
model.userFactors.unpersist()
model.itemFactors.unpersist()
(auc, (rank, regParam, alpha))
}
evaluations.sorted.reverse.foreach(println)
trainData.unpersist()
cvData.unpersist()
}
def recommend(rawUserArtistData: Dataset[String],rawArtistData: Dataset[String],rawArtistAlias: Dataset[String]): Unit = {
val bArtistAlias = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))
val allData = buildCounts(rawUserArtistData, bArtistAlias).cache()
val model = new ALS().
setSeed(Random.nextLong()).
setImplicitPrefs(true).
setRank(10).setRegParam(1.0).setAlpha(40.0).setMaxIter(20).
setUserCol("user").setItemCol("artist").
setRatingCol("count").setPredictionCol("prediction").
fit(allData)
allData.unpersist()
val userID = 2093760
val topRecommendations = makeRecommendations(model, userID, 5)
val recommendedArtistIDs = topRecommendations.select("artist").as[Int].collect()
val artistByID = buildArtistByID(rawArtistData)
artistByID.join(spark.createDataset(recommendedArtistIDs).toDF("id"), "id").select("name").show()
model.userFactors.unpersist()
model.itemFactors.unpersist()
}
def buildArtistByID(rawArtistData: Dataset[String]): DataFrame = {
rawArtistData.flatMap { line =>
val (id, name) = line.span(_ != '\t')
if (name.isEmpty) {None} else {try {Some((id.toInt, name.trim))} catch {case _: NumberFormatException => None}}}.toDF("id", "name")
}
def buildArtistAlias(rawArtistAlias: Dataset[String]): Map[Int,Int] = {
rawArtistAlias.flatMap { line =>
val Array(artist, alias) = line.split('\t')
if (artist.isEmpty) {None} else {Some((artist.toInt, alias.toInt))}}.collect().toMap
}
def buildCounts(rawUserArtistData: Dataset[String],bArtistAlias: Broadcast[Map[Int,Int]]): DataFrame = {
rawUserArtistData.map { line =>
val Array(userID, artistID, count) = line.split(' ').map(_.toInt)
val finalArtistID = bArtistAlias.value.getOrElse(artistID, artistID)
(userID, finalArtistID, count)
}.toDF("user", "artist", "count")
}
def makeRecommendations(model: ALSModel, userID: Int, howMany: Int): DataFrame = {
val toRecommend = model.itemFactors.select($"id".as("artist")).withColumn("user", lit(userID))
model.transform(toRecommend).select("artist", "prediction").orderBy($"prediction".desc).limit(howMany)
}
def areaUnderCurve(positiveData: DataFrame,bAllArtistIDs: Broadcast[Array[Int]],predictFunction: (DataFrame => DataFrame)): Double = {
// What this actually computes is AUC, per user. The result is actually something
// that might be called "mean AUC".
// Take held-out data as the "positive".
// Make predictions for each of them, including a numeric score
val positivePredictions = predictFunction(positiveData.select("user", "artist")).withColumnRenamed("prediction", "positivePrediction")
// BinaryClassificationMetrics.areaUnderROC is not used here since there are really lots of
// small AUC problems, and it would be inefficient, when a direct computation is available.
// Create a set of "negative" products for each user. These are randomly chosen
// from among all of the other artists, excluding those that are "positive" for the user.
val negativeData = positiveData.select("user", "artist").as[(Int,Int)].groupByKey { case (user, _) => user }.
flatMapGroups { case (userID, userIDAndPosArtistIDs) =>
val random = new Random()
val posItemIDSet = userIDAndPosArtistIDs.map { case (_, artist) => artist }.toSet
val negative = new ArrayBuffer[Int]()
val allArtistIDs = bAllArtistIDs.value
var i = 0
// Make at most one pass over all artists to avoid an infinite loop.
// Also stop when number of negative equals positive set size
while (i < allArtistIDs.length && negative.size < posItemIDSet.size) {
val artistID = allArtistIDs(random.nextInt(allArtistIDs.length))
// Only add new distinct IDs
if (!posItemIDSet.contains(artistID)) {negative += artistID}
i += 1
}
// Return the set with user ID added back
negative.map(artistID => (userID, artistID))
}.toDF("user", "artist")
// Make predictions on the rest:
val negativePredictions = predictFunction(negativeData).
withColumnRenamed("prediction", "negativePrediction")
// Join positive predictions to negative predictions by user, only.
// This will result in a row for every possible pairing of positive and negative
// predictions within each user.
val joinedPredictions = positivePredictions.join(negativePredictions, "user").select("user", "positivePrediction", "negativePrediction").cache()
// Count the number of pairs per user
val allCounts = joinedPredictions.groupBy("user").agg(count(lit("1")).as("total")).select("user", "total")
// Count the number of correctly ordered pairs per user
val correctCounts = joinedPredictions.
filter($"positivePrediction" > $"negativePrediction").
groupBy("user").agg(count("user").as("correct")).
select("user", "correct")
// Combine these, compute their ratio, and average over all users
val meanAUC = allCounts.join(correctCounts, Seq("user"), "left_outer").
select($"user", (coalesce($"correct", lit(0)) / $"total").as("auc")).
agg(mean("auc")).
as[Double].first()
joinedPredictions.unpersist()
meanAUC
}
def predictMostListened(train: DataFrame)(allData: DataFrame): DataFrame = {
val listenCounts = train.groupBy("artist").agg(sum("count").as("prediction")).select("artist", "prediction")
allData.join(listenCounts, Seq("artist"), "left_outer").select("user", "artist", "prediction")
}
}
-----------------------------------------------------------------------------------------------------------------------
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++cp4 Predicting forest cover with decision trees
//4.5 The Covtype data set
https://archive.ics.uci.edu/ml/machine-learning-databases/covtype
//Turn off Hadoop safe mode
/usr/local/hadoop/bin/hadoop dfsadmin -safemode leave
hadoop fs -put covtype.data /user/zieox/
//Read the data
val dataWithoutHeader = spark.read.option("inferSchema",true).option("header",false).csv("hdfs:///user/zieox/covtype.data")
//Add column names
val colNames = Seq("Elevation","Aspect","Slope","Horizontal_Distance_To_Hydrology","Vertical_Distance_To_Hydrology","Horizontal_Distance_To_Roadways","Hillshade_9am","Hillshade_Noon","Hillshade_3pm","Horizontal_Distance_To_Fire_Points") ++ ((0 until 4).map(i=>s"Wilderness_Area_$i")) ++ ((0 until 40).map(i=>s"Soil_Type_$i")) ++ Seq("Cover_Type")
val data = dataWithoutHeader.toDF(colNames:_*).withColumn("Cover_Type", $"Cover_Type".cast("double"))
//4.7 A first decision tree
val Array(trainData,testData) = data.randomSplit(Array(0.9,0.1))
trainData.cache()
testData.cache()
import org.apache.spark.ml.feature.VectorAssembler
val inputCols = trainData.columns.filter(_ != "Cover_Type")
val assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("featureVector")
val assembledTrainData = assembler.transform(trainData)
assembledTrainData.select("featureVector").show(truncate=false)
import scala.util.Random
import org.apache.spark.ml.classification.{DecisionTreeClassifier,RandomForestClassifier, RandomForestClassificationModel}
val classifier = new DecisionTreeClassifier().
setSeed(Random.nextLong()).
setLabelCol("Cover_Type").
setFeaturesCol("featureVector").
setPredictionCol("prediction")
val model = classifier.fit(assembledTrainData)
println(model.toDebugString)
model.featureImportances.toArray.zip(inputCols).sorted.reverse.foreach(println)
val predictions = model.transform(assembledTrainData)
predictions.select("Cover_Type","prediction","probability").show(truncate = false)
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("Cover_Type").setPredictionCol("prediction")
evaluator.setMetricName("accuracy").evaluate(predictions)
evaluator.setMetricName("f1").evaluate(predictions)
import org.apache.spark.mllib.evaluation.MulticlassMetrics
val predictionRDD = predictions.select("prediction", "Cover_Type").as[(Double,Double)].rdd
val multiclassMetrics = new MulticlassMetrics(predictionRDD)
println(multiclassMetrics.confusionMatrix)
val confusionMatrix = predictions.groupBy("Cover_Type").pivot("prediction", (1 to 7)).count().na.fill(0.0).orderBy("Cover_Type")
confusionMatrix.show()
import org.apache.spark.sql.DataFrame
def classProbabilities(data: DataFrame): Array[Double] = {
val total = data.count()
data.groupBy("Cover_Type").count().orderBy("Cover_Type").select("count").as[Double].map(_ / total).collect()
}
//Mounting an extra disk in the VM: Settings -> Storage -> add a virtual hard disk -> create a new partition -> format it
//4.8 Decision tree hyperparameters
//4.9 Tuning decision trees
import org.apache.spark.ml.Pipeline
val inputCols = trainData.columns.filter(_ != "Cover_Type")
val assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("featureVector")
val classifier = new DecisionTreeClassifier().
setSeed(Random.nextLong()).
setLabelCol("Cover_type").
setFeaturesCol("featureVector").setPredictionCol("prediction")
val pipeline = new Pipeline().setStages(Array(assembler,classifier))
import org.apache.spark.ml.tuning.ParamGridBuilder
val paramGrid = new ParamGridBuilder().
addGrid(classifier.impurity,Seq("gini","entropy")).
addGrid(classifier.maxDepth,Seq(1,20)).
addGrid(classifier.maxBins,Seq(40,300)).
addGrid(classifier.minInfoGain,Seq(0.0,0.05)).
build()
val multiclassEval = new MulticlassClassificationEvaluator().setLabelCol("Cover_Type").setPredictionCol("prediction").setMetricName("accuracy")
import org.apache.spark.ml.tuning.TrainValidationSplit
val validator = new TrainValidationSplit().
setSeed(Random.nextLong()).
setEstimator(pipeline).
setEvaluator(multiclassEval).
setEstimatorParamMaps(paramGrid).
setTrainRatio(0.9)
val validatorModel = validator.fit(trainData)
//Manually extract the DecisionTreeClassificationModel instance from the PipelineModel
import org.apache.spark.ml.PipelineModel
val bestModel = validatorModel.bestModel
//Get the evaluation result for each hyperparameter combination
val validatorModel = validator.fit(trainData)
val paramAndMetrics = validatorModel.validationMetrics.zip(validatorModel.getEstimatorParamMaps).sortBy(-_._1)
paramAndMetrics.foreach{ case (metric,params) =>
println(metric)
println(params)
println()
}
//This model's accuracy on the held-out validation data
validatorModel.validationMetrics.max
multiclassEval.evaluate(bestModel.transform(testData))
import org.apache.spark.sql.functions._
import org.apache.spark.ml.linalg.Vector
def unencodeOneHot(data: DataFrame): DataFrame = {
val wildernessCols = (0 until 4).map(i => s"Wilderness_Area_$i").toArray
val wildernessAssembler = new VectorAssembler().setInputCols(wildernessCols).setOutputCol("wilderness")
val unhotUDF = udf((vec: Vector) => vec.toArray.indexOf(1.0).toDouble)
val withWilderness = wildernessAssembler.transform(data).drop(wildernessCols:_*).withColumn("wilderness", unhotUDF($"wilderness"))
val soilCols = (0 until 40).map(i => s"Soil_Type_$i").toArray
val soilAssembler = new VectorAssembler().setInputCols(soilCols).setOutputCol("soil")
soilAssembler.transform(withWilderness).drop(soilCols:_*).withColumn("soil", unhotUDF($"soil"))
}
import org.apache.spark.ml.feature.VectorIndexer
val unencTrainData = unencodeOneHot(trainData)
val unencTestData = unencodeOneHot(testData)
val inputCols = unencTrainData.columns.filter(_ != "Cover_Type")
val assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("featureVector")
val indexer = new VectorIndexer().setMaxCategories(40).setInputCol("featureVector").setOutputCol("indexedVector")
val classifier = new DecisionTreeClassifier().setSeed(Random.nextLong()).setLabelCol("Cover_Type").setFeaturesCol("indexedVector").setPredictionCol("prediction")
val pipeline = new Pipeline().setStages(Array(assembler, indexer, classifier))
//4.11 Random decision forests
val classifier = new RandomForestClassifier().
setSeed(Random.nextLong()).
setLabelCol("Cover_Type").
setFeaturesCol("indexedVector").
setPredictionCol("prediction")
val forestModel = bestModel.asInstanceOf[PipelineModel].stages.last.asInstanceOf[RandomForestClassificationModel]
forestModel.featureImportances.toArray.zip(inputCols).sorted.reverse.foreach(println)
//4.12 Making predictions
bestModel.transform(unencTestData.drop("Cover_Type")).select("prediction").show()
-----------------------------------------------------------------------------------------------------------------------
package com.cloudera.datascience.rdf
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{DecisionTreeClassifier, RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import scala.util.Random
object RunRDF {
def main(args: Array[String]): Unit = {
val conf=new SparkConf().setAppName("RunRDF").setMaster("local[*]")
val sc=new SparkContext(conf)
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val dataWithoutHeader = spark.read.option("inferSchema", true).option("header", false).csv("hdfs://master:9000/user/zieox/covtype.data")
val colNames = Seq("Elevation", "Aspect", "Slope","Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways","Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm","Horizontal_Distance_To_Fire_Points")++ ((0 until 4).map(i => s"Wilderness_Area_$i"))++ ((0 until 40).map(i => s"Soil_Type_$i"))++ Seq("Cover_Type")
val data = dataWithoutHeader.toDF(colNames:_*).withColumn("Cover_Type", $"Cover_Type".cast("double"))
data.show()
data.head
// Split into 90% train (+ CV), 10% test
val Array(trainData, testData) = data.randomSplit(Array(0.9, 0.1))
trainData.cache()
testData.cache()
val runRDF = new RunRDF(spark)
runRDF.simpleDecisionTree(trainData, testData)
runRDF.randomClassifier(trainData, testData)
runRDF.evaluate(trainData, testData)
runRDF.evaluateCategorical(trainData, testData)
runRDF.evaluateForest(trainData, testData)
trainData.unpersist()
testData.unpersist()
}
}
class RunRDF(private val spark: SparkSession) {
import spark.implicits._
def simpleDecisionTree(trainData: DataFrame, testData: DataFrame): Unit = {
val inputCols = trainData.columns.filter(_ != "Cover_Type")
val assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("featureVector")
val assembledTrainData = assembler.transform(trainData)
assembledTrainData.select("featureVector").show(truncate = false)
val classifier = new DecisionTreeClassifier().
setSeed(Random.nextLong()).
setLabelCol("Cover_Type").
setFeaturesCol("featureVector").
setPredictionCol("prediction")
val model = classifier.fit(assembledTrainData)
println(model.toDebugString)
model.featureImportances.toArray.zip(inputCols).sorted.reverse.foreach(println)
val predictions = model.transform(assembledTrainData)
predictions.select("Cover_Type", "prediction", "probability").show(truncate = false)
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("Cover_Type").setPredictionCol("prediction")
val accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
val f1 = evaluator.setMetricName("f1").evaluate(predictions)
println(accuracy)
println(f1)
val predictionRDD = predictions.select("prediction", "Cover_Type").as[(Double,Double)].rdd
val multiclassMetrics = new MulticlassMetrics(predictionRDD)
println(multiclassMetrics.confusionMatrix)
val confusionMatrix = predictions.groupBy("Cover_Type").pivot("prediction", (1 to 7)).count().na.fill(0.0).orderBy("Cover_Type")
confusionMatrix.show()
}
def classProbabilities(data: DataFrame): Array[Double] = {
val total = data.count()
data.groupBy("Cover_Type").count().orderBy("Cover_Type").select("count").as[Double].map(_ / total).collect()
}
def randomClassifier(trainData: DataFrame, testData: DataFrame): Unit = {
val trainPriorProbabilities = classProbabilities(trainData)
val testPriorProbabilities = classProbabilities(testData)
val accuracy = trainPriorProbabilities.zip(testPriorProbabilities).map {case (trainProb, cvProb) => trainProb * cvProb}.sum
println(accuracy)
}
def evaluate(trainData: DataFrame, testData: DataFrame): Unit = {
val inputCols = trainData.columns.filter(_ != "Cover_Type")
val assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("featureVector")
val classifier = new DecisionTreeClassifier().
setSeed(Random.nextLong()).
setLabelCol("Cover_Type").
setFeaturesCol("featureVector").
setPredictionCol("prediction")
val pipeline = new Pipeline().setStages(Array(assembler, classifier))
val paramGrid = new ParamGridBuilder().
addGrid(classifier.impurity, Seq("gini", "entropy")).
addGrid(classifier.maxDepth, Seq(1, 20)).
addGrid(classifier.maxBins, Seq(40, 300)).
addGrid(classifier.minInfoGain, Seq(0.0, 0.05)).
build()
val multiclassEval = new MulticlassClassificationEvaluator().
setLabelCol("Cover_Type").
setPredictionCol("prediction").
setMetricName("accuracy")
val validator = new TrainValidationSplit().
setSeed(Random.nextLong()).
setEstimator(pipeline).
setEvaluator(multiclassEval).
setEstimatorParamMaps(paramGrid).
setTrainRatio(0.9)
val validatorModel = validator.fit(trainData)
val paramsAndMetrics = validatorModel.validationMetrics.zip(validatorModel.getEstimatorParamMaps).sortBy(-_._1)
paramsAndMetrics.foreach { case (metric, params) =>
println(metric)
println(params)
println()
}
val bestModel = validatorModel.bestModel
println(bestModel.asInstanceOf[PipelineModel].stages.last.extractParamMap)
println(validatorModel.validationMetrics.max)
val testAccuracy = multiclassEval.evaluate(bestModel.transform(testData))
println(testAccuracy)
val trainAccuracy = multiclassEval.evaluate(bestModel.transform(trainData))
println(trainAccuracy)
}
def unencodeOneHot(data: DataFrame): DataFrame = {
val wildernessCols = (0 until 4).map(i => s"Wilderness_Area_$i").toArray
val wildernessAssembler = new VectorAssembler().setInputCols(wildernessCols).setOutputCol("wilderness")
val unhotUDF = udf((vec: Vector) => vec.toArray.indexOf(1.0).toDouble)
val withWilderness = wildernessAssembler.transform(data).drop(wildernessCols:_*).withColumn("wilderness", unhotUDF($"wilderness"))
val soilCols = (0 until 40).map(i => s"Soil_Type_$i").toArray
val soilAssembler = new VectorAssembler().setInputCols(soilCols).setOutputCol("soil")
soilAssembler.transform(withWilderness).drop(soilCols:_*).withColumn("soil", unhotUDF($"soil"))
}
def evaluateCategorical(trainData: DataFrame, testData: DataFrame): Unit = {
val unencTrainData = unencodeOneHot(trainData)
val unencTestData = unencodeOneHot(testData)
val inputCols = unencTrainData.columns.filter(_ != "Cover_Type")
val assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("featureVector")
val indexer = new VectorIndexer().setMaxCategories(40).setInputCol("featureVector").setOutputCol("indexedVector")
val classifier = new DecisionTreeClassifier().
setSeed(Random.nextLong()).
setLabelCol("Cover_Type").
setFeaturesCol("indexedVector").
setPredictionCol("prediction")
val pipeline = new Pipeline().setStages(Array(assembler, indexer, classifier))
val paramGrid = new ParamGridBuilder().
addGrid(classifier.impurity, Seq("gini", "entropy")).
addGrid(classifier.maxDepth, Seq(1, 20)).
addGrid(classifier.maxBins, Seq(40, 300)).
addGrid(classifier.minInfoGain, Seq(0.0, 0.05)).
build()
val multiclassEval = new MulticlassClassificationEvaluator().setLabelCol("Cover_Type").setPredictionCol("prediction").setMetricName("accuracy")
val validator = new TrainValidationSplit().
setSeed(Random.nextLong()).
setEstimator(pipeline).
setEvaluator(multiclassEval).
setEstimatorParamMaps(paramGrid).
setTrainRatio(0.9)
val validatorModel = validator.fit(unencTrainData)
val bestModel = validatorModel.bestModel
println(bestModel.asInstanceOf[PipelineModel].stages.last.extractParamMap)
val testAccuracy = multiclassEval.evaluate(bestModel.transform(unencTestData))
println(testAccuracy)
}
def evaluateForest(trainData: DataFrame, testData: DataFrame): Unit = {
val unencTrainData = unencodeOneHot(trainData)
val unencTestData = unencodeOneHot(testData)
val inputCols = unencTrainData.columns.filter(_ != "Cover_Type")
val assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("featureVector")
val indexer = new VectorIndexer().setMaxCategories(40).setInputCol("featureVector").setOutputCol("indexedVector")
val classifier = new RandomForestClassifier().
setSeed(Random.nextLong()).
setLabelCol("Cover_Type").
setFeaturesCol("indexedVector").
setPredictionCol("prediction").
setImpurity("entropy").
setMaxDepth(20).
setMaxBins(300)
val pipeline = new Pipeline().setStages(Array(assembler, indexer, classifier))
val paramGrid = new ParamGridBuilder().
addGrid(classifier.minInfoGain, Seq(0.0, 0.05)).
addGrid(classifier.numTrees, Seq(1, 10)).
build()
val multiclassEval = new MulticlassClassificationEvaluator().setLabelCol("Cover_Type").setPredictionCol("prediction").setMetricName("accuracy")
val validator = new TrainValidationSplit().
setSeed(Random.nextLong()).
setEstimator(pipeline).
setEvaluator(multiclassEval).
setEstimatorParamMaps(paramGrid).
setTrainRatio(0.9)
val validatorModel = validator.fit(unencTrainData)
val bestModel = validatorModel.bestModel
val forestModel = bestModel.asInstanceOf[PipelineModel].stages.last.asInstanceOf[RandomForestClassificationModel]
println(forestModel.extractParamMap)
println(forestModel.getNumTrees)
forestModel.featureImportances.toArray.zip(inputCols).sorted.reverse.foreach(println)
val testAccuracy = multiclassEval.evaluate(bestModel.transform(unencTestData))
println(testAccuracy)
bestModel.transform(unencTestData.drop("Cover_Type")).select("prediction").show()
}
}
-----------------------------------------------------------------------------------------------------------------------
https://blog.youkuaiyun.com/cc1949/article/details/80604379
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++cp5 Anomaly detection in network traffic with K-means clustering
//5.1 Anomaly detection
//5.2 K-means clustering
//5.3 Network intrusions
//5.4 The KDD Cup 1999 data set
http://www.kdd.org/kdd-cup/view/kdd-cup-1999/Tasks
http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
//Download kddcup.data.gz
cd /media/zieox/zieox
mkdir sparkS2
wget http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz
gunzip kddcup.data.gz
/usr/local/hadoop/bin/hadoop dfsadmin -safemode leave
hadoop fs -put kddcup.data /user/zieox/
val path = "/user/zieox/kddcup.data"
//5.5 A first take on clustering
//read_data
http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names
val data = spark.read.
option("inferSchema", true).
option("header", false).
csv("hdfs://master:9000/user/zieox/kddcup.data").
toDF(
"duration", "protocol_type", "service", "flag",
"src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
"hot", "num_failed_logins", "logged_in", "num_compromised",
"root_shell", "su_attempted", "num_root", "num_file_creations",
"num_shells", "num_access_files", "num_outbound_cmds",
"is_host_login", "is_guest_login", "count", "srv_count",
"serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
"same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
"dst_host_count", "dst_host_srv_count",
"dst_host_same_srv_rate", "dst_host_diff_srv_rate",
"dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
"dst_host_serror_rate", "dst_host_srv_serror_rate",
"dst_host_rerror_rate", "dst_host_srv_rerror_rate",
"label")
data.select("label").groupBy("label").count().orderBy($"count".desc).show(25)
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.feature.{OneHotEncoder, VectorAssembler, StringIndexer, StandardScaler}
import scala.util.Random
val numericOnly = data.drop("protocol_type", "service", "flag").cache()
val assembler = new VectorAssembler().setInputCols(numericOnly.columns.filter(_ != "label")).setOutputCol("featureVector")
val kmeans = new KMeans().setSeed(Random.nextLong()).setPredictionCol("cluster").setFeaturesCol("featureVector")
val pipeline = new Pipeline().setStages(Array(assembler, kmeans))
val pipelineModel = pipeline.fit(numericOnly)
val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
kmeansModel.clusterCenters.foreach(println)
val withCluster = pipelineModel.transform(numericOnly)
withCluster.select("cluster", "label").groupBy("cluster", "label").count().orderBy($"cluster", $"count".desc).show(25)
//5.6 Choosing k
import org.apache.spark.sql.DataFrame
def clusteringScore0(data: DataFrame, k: Int): Double = {
val assembler = new VectorAssembler().setInputCols(data.columns.filter(_ != "label")).setOutputCol("featureVector")
val kmeans = new KMeans().setSeed(Random.nextLong()).setK(k).setPredictionCol("cluster").setFeaturesCol("featureVector")
val pipeline = new Pipeline().setStages(Array(assembler, kmeans))
val kmeansModel = pipeline.fit(data).stages.last.asInstanceOf[KMeansModel]
kmeansModel.computeCost(assembler.transform(data)) / data.count()
}
(20 to 100 by 20).map(k=>(k,clusteringScore0(numericOnly,k))).foreach(println)
//5.7 Visualization with SparkR
sudo apt-get install r-base
//5.8 Feature normalization
// Clustering, Take 2
import org.apache.spark.ml.feature.StandardScaler
def clusteringScore2(data: DataFrame, k: Int): Double = {
val assembler = new VectorAssembler().setInputCols(data.columns.filter(_ != "label")).setOutputCol("featureVector")
val scaler = new StandardScaler().setInputCol("featureVector").setOutputCol("scaledFeatureVector").setWithStd(true).setWithMean(false)
val kmeans = new KMeans().setSeed(Random.nextLong()).setK(k).setPredictionCol("cluster").setFeaturesCol("scaledFeatureVector").setMaxIter(40).setTol(1.0e-5)
val pipeline = new Pipeline().setStages(Array(assembler, scaler, kmeans))
val pipelineModel = pipeline.fit(data)
val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
kmeansModel.computeCost(pipelineModel.transform(data)) / data.count()
}
(60 to 270 by 30).map(k=>(k,clusteringScore2(numericOnly,k))).foreach(println)
//5.9 Categorical variables
import org.apache.spark.ml.feature.{OneHotEncoder, VectorAssembler, StringIndexer, StandardScaler}
// Clustering, Take 3
def oneHotPipeline(inputCol: String): (Pipeline, String) = {
val indexer = new StringIndexer().setInputCol(inputCol).setOutputCol(inputCol + "_indexed")
val encoder = new OneHotEncoder().setInputCol(inputCol + "_indexed").setOutputCol(inputCol + "_vec")
val pipeline = new Pipeline().setStages(Array(indexer, encoder))
(pipeline, inputCol + "_vec")
}
//5.10 Using labels with entropy
// Clustering, Take 4
def entropy(counts: Iterable[Int]): Double = {
val values = counts.filter(_ > 0)
val n = values.map(_.toDouble).sum
values.map { v => val p = v / n; -p * math.log(p) }.sum
}
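//Quick sanity check with made-up label counts: a uniform mix has higher entropy than a skewed one
entropy(Seq(10, 10, 10)) //~1.10 (= ln 3)
entropy(Seq(28, 1, 1))   //~0.29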
def clusteringScore4(data: DataFrame, k: Int): Double = {
val pipelineModel = fitPipeline4(data, k) // Predict cluster for each datum
val clusterLabel = pipelineModel.transform(data).select("cluster", "label").as[(Int, String)]
val weightedClusterEntropy = clusterLabel.
// Extract collections of labels, per cluster
groupByKey { case (cluster, _) => cluster }.
mapGroups { case (_, clusterLabels) =>
val labels = clusterLabels.map { case (_, label) => label }.toSeq
// Count labels in collections
val labelCounts = labels.groupBy(identity).values.map(_.size)
labels.size * entropy(labelCounts)
}.collect()
// Average entropy weighted by cluster size
weightedClusterEntropy.sum / data.count()
}
def fitPipeline4(data: DataFrame, k: Int): PipelineModel = {
val (protoTypeEncoder, protoTypeVecCol) = oneHotPipeline("protocol_type")
val (serviceEncoder, serviceVecCol) = oneHotPipeline("service")
val (flagEncoder, flagVecCol) = oneHotPipeline("flag")
// Original columns, without label / string columns, but with new vector encoded cols
val assembleCols = Set(data.columns: _*) --
Seq("label", "protocol_type", "service", "flag") ++
Seq(protoTypeVecCol, serviceVecCol, flagVecCol)
val assembler = new VectorAssembler().
setInputCols(assembleCols.toArray).
setOutputCol("featureVector")
val scaler = new StandardScaler()
.setInputCol("featureVector")
.setOutputCol("scaledFeatureVector")
.setWithStd(true)
.setWithMean(false)
val kmeans = new KMeans().
setSeed(Random.nextLong()).
setK(k).
setPredictionCol("cluster").
setFeaturesCol("scaledFeatureVector").
setMaxIter(40).
setTol(1.0e-5)
val pipeline = new Pipeline().setStages(
Array(protoTypeEncoder, serviceEncoder, flagEncoder, assembler, scaler, kmeans))
pipeline.fit(data)
}
//5.11 Clustering in action
def clusteringTake4(data: DataFrame): Unit = {
(60 to 270 by 30).map(k => (k, clusteringScore4(data, k))).foreach(println)
val pipelineModel = fitPipeline4(data, 180)
val countByClusterLabel = pipelineModel.transform(data).
select("cluster", "label").
groupBy("cluster", "label").count().
orderBy("cluster", "label")
countByClusterLabel.show()
}
import org.apache.spark.ml.linalg.{Vector, Vectors}
val kMeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
val centroids = kMeansModel.clusterCenters
val clustered = pipelineModel.transform(data)
val threshold = clustered.
select("cluster", "scaledFeatureVector").as[(Int, Vector)].
map { case (cluster, vec) => Vectors.sqdist(centroids(cluster), vec) }.
orderBy($"value".desc).take(100).last
val originalCols = data.columns
val anomalies = clustered.filter { row =>
val cluster = row.getAs[Int]("cluster")
val vec = row.getAs[Vector]("scaledFeatureVector")
Vectors.sqdist(centroids(cluster), vec) >= threshold
}.select(originalCols.head, originalCols.tail:_*)
-----------------------------------------------------------------------------------------------------------------------
package com.cloudera.datascience.kmeans
import org.apache.spark.ml.{PipelineModel, Pipeline}
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.feature.{OneHotEncoder, VectorAssembler, StringIndexer, StandardScaler}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.util.Random
object RunKMeans {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().getOrCreate()
val data = spark.read.
option("inferSchema", true).
option("header", false).
csv("hdfs:///user/ds/kddcup.data").
toDF(
"duration", "protocol_type", "service", "flag",
"src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
"hot", "num_failed_logins", "logged_in", "num_compromised",
"root_shell", "su_attempted", "num_root", "num_file_creations",
"num_shells", "num_access_files", "num_outbound_cmds",
"is_host_login", "is_guest_login", "count", "srv_count",
"serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
"same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
"dst_host_count", "dst_host_srv_count",
"dst_host_same_srv_rate", "dst_host_diff_srv_rate",
"dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
"dst_host_serror_rate", "dst_host_srv_serror_rate",
"dst_host_rerror_rate", "dst_host_srv_rerror_rate",
"label")
data.cache()
val runKMeans = new RunKMeans(spark)
runKMeans.clusteringTake0(data)
runKMeans.clusteringTake1(data)
runKMeans.clusteringTake2(data)
runKMeans.clusteringTake3(data)
runKMeans.clusteringTake4(data)
runKMeans.buildAnomalyDetector(data)
data.unpersist()
}
}
class RunKMeans(private val spark: SparkSession) {
import spark.implicits._
// Clustering, Take 0
def clusteringTake0(data: DataFrame): Unit = {
data.select("label").groupBy("label").count().orderBy($"count".desc).show(25)
val numericOnly = data.drop("protocol_type", "service", "flag").cache()
val assembler = new VectorAssembler().
setInputCols(numericOnly.columns.filter(_ != "label")).
setOutputCol("featureVector")
val kmeans = new KMeans().
setSeed(Random.nextLong()).
setPredictionCol("cluster").
setFeaturesCol("featureVector")
val pipeline = new Pipeline().setStages(Array(assembler, kmeans))
val pipelineModel = pipeline.fit(numericOnly)
val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
kmeansModel.clusterCenters.foreach(println)
val withCluster = pipelineModel.transform(numericOnly)
withCluster.select("cluster", "label").groupBy("cluster", "label").count().orderBy($"cluster", $"count".desc).show(25)
numericOnly.unpersist()
}
// Clustering, Take 1
def clusteringScore0(data: DataFrame, k: Int): Double = {
val assembler = new VectorAssembler().
setInputCols(data.columns.filter(_ != "label")).
setOutputCol("featureVector")
val kmeans = new KMeans().
setSeed(Random.nextLong()).
setK(k).
setPredictionCol("cluster").
setFeaturesCol("featureVector")
val pipeline = new Pipeline().setStages(Array(assembler, kmeans))
val kmeansModel = pipeline.fit(data).stages.last.asInstanceOf[KMeansModel]
kmeansModel.computeCost(assembler.transform(data)) / data.count()
}
def clusteringScore1(data: DataFrame, k: Int): Double = {
val assembler = new VectorAssembler().
setInputCols(data.columns.filter(_ != "label")).
setOutputCol("featureVector")
val kmeans = new KMeans().
setSeed(Random.nextLong()).
setK(k).
setPredictionCol("cluster").
setFeaturesCol("featureVector").
setMaxIter(40).
setTol(1.0e-5)
val pipeline = new Pipeline().setStages(Array(assembler, kmeans))
val kmeansModel = pipeline.fit(data).stages.last.asInstanceOf[KMeansModel]
kmeansModel.computeCost(assembler.transform(data)) / data.count()
}
def clusteringTake1(data: DataFrame): Unit = {
val numericOnly = data.drop("protocol_type", "service", "flag").cache()
(20 to 100 by 20).map(k => (k, clusteringScore0(numericOnly, k))).foreach(println)
(20 to 100 by 20).map(k => (k, clusteringScore1(numericOnly, k))).foreach(println)
numericOnly.unpersist()
}
// Clustering, Take 2
def clusteringScore2(data: DataFrame, k: Int): Double = {
val assembler = new VectorAssembler().
setInputCols(data.columns.filter(_ != "label")).
setOutputCol("featureVector")
val scaler = new StandardScaler()
.setInputCol("featureVector")
.setOutputCol("scaledFeatureVector")
.setWithStd(true)
.setWithMean(false)
val kmeans = new KMeans().
setSeed(Random.nextLong()).
setK(k).
setPredictionCol("cluster").
setFeaturesCol("scaledFeatureVector").
setMaxIter(40).
setTol(1.0e-5)
val pipeline = new Pipeline().setStages(Array(assembler, scaler, kmeans))
val pipelineModel = pipeline.fit(data)
val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
kmeansModel.computeCost(pipelineModel.transform(data)) / data.count()
}
def clusteringTake2(data: DataFrame): Unit = {
val numericOnly = data.drop("protocol_type", "service", "flag").cache()
(60 to 270 by 30).map(k => (k, clusteringScore2(numericOnly, k))).foreach(println)
numericOnly.unpersist()
}
// Clustering, Take 3
def oneHotPipeline(inputCol: String): (Pipeline, String) = {
val indexer = new StringIndexer().
setInputCol(inputCol).
setOutputCol(inputCol + "_indexed")
val encoder = new OneHotEncoder().
setInputCol(inputCol + "_indexed").
setOutputCol(inputCol + "_vec")
val pipeline = new Pipeline().setStages(Array(indexer, encoder))
(pipeline, inputCol + "_vec")
}
def clusteringScore3(data: DataFrame, k: Int): Double = {
val (protoTypeEncoder, protoTypeVecCol) = oneHotPipeline("protocol_type")
val (serviceEncoder, serviceVecCol) = oneHotPipeline("service")
val (flagEncoder, flagVecCol) = oneHotPipeline("flag")
// Original columns, without label / string columns, but with new vector encoded cols
val assembleCols = Set(data.columns: _*) --
Seq("label", "protocol_type", "service", "flag") ++
Seq(protoTypeVecCol, serviceVecCol, flagVecCol)
val assembler = new VectorAssembler().
setInputCols(assembleCols.toArray).
setOutputCol("featureVector")
val scaler = new StandardScaler()
.setInputCol("featureVector")
.setOutputCol("scaledFeatureVector")
.setWithStd(true)
.setWithMean(false)
val kmeans = new KMeans().
setSeed(Random.nextLong()).
setK(k).
setPredictionCol("cluster").
setFeaturesCol("scaledFeatureVector").
setMaxIter(40).
setTol(1.0e-5)
val pipeline = new Pipeline().setStages(
Array(protoTypeEncoder, serviceEncoder, flagEncoder, assembler, scaler, kmeans))
val pipelineModel = pipeline.fit(data)
val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
kmeansModel.computeCost(pipelineModel.transform(data)) / data.count()
}
def clusteringTake3(data: DataFrame): Unit = {
(60 to 270 by 30).map(k => (k, clusteringScore3(data, k))).foreach(println)
}
// Clustering, Take 4
def entropy(counts: Iterable[Int]): Double = {
val values = counts.filter(_ > 0)
val n = values.map(_.toDouble).sum
values.map { v =>
val p = v / n
-p * math.log(p)
}.sum
}
def fitPipeline4(data: DataFrame, k: Int): PipelineModel = {
val (protoTypeEncoder, protoTypeVecCol) = oneHotPipeline("protocol_type")
val (serviceEncoder, serviceVecCol) = oneHotPipeline("service")
val (flagEncoder, flagVecCol) = oneHotPipeline("flag")
// Original columns, without label / string columns, but with new vector encoded cols
val assembleCols = Set(data.columns: _*) --
Seq("label", "protocol_type", "service", "flag") ++
Seq(protoTypeVecCol, serviceVecCol, flagVecCol)
val assembler = new VectorAssembler().
setInputCols(assembleCols.toArray).
setOutputCol("featureVector")
val scaler = new StandardScaler()
.setInputCol("featureVector")
.setOutputCol("scaledFeatureVector")
.setWithStd(true)
.setWithMean(false)
val kmeans = new KMeans().
setSeed(Random.nextLong()).
setK(k).
setPredictionCol("cluster").
setFeaturesCol("scaledFeatureVector").
setMaxIter(40).
setTol(1.0e-5)
val pipeline = new Pipeline().setStages(
Array(protoTypeEncoder, serviceEncoder, flagEncoder, assembler, scaler, kmeans))
pipeline.fit(data)
}
def clusteringScore4(data: DataFrame, k: Int): Double = {
val pipelineModel = fitPipeline4(data, k)
// Predict cluster for each datum
val clusterLabel = pipelineModel.transform(data).select("cluster", "label").as[(Int, String)]
val weightedClusterEntropy = clusterLabel.
// Extract collections of labels, per cluster
groupByKey { case (cluster, _) => cluster }.
mapGroups { case (_, clusterLabels) =>
val labels = clusterLabels.map { case (_, label) => label }.toSeq
// Count labels in collections
val labelCounts = labels.groupBy(identity).values.map(_.size)
labels.size * entropy(labelCounts)
}.collect()
// Average entropy weighted by cluster size
weightedClusterEntropy.sum / data.count()
}
def clusteringTake4(data: DataFrame): Unit = {
(60 to 270 by 30).map(k => (k, clusteringScore4(data, k))).foreach(println)
val pipelineModel = fitPipeline4(data, 180)
val countByClusterLabel = pipelineModel.transform(data).
select("cluster", "label").
groupBy("cluster", "label").count().
orderBy("cluster", "label")
countByClusterLabel.show()
}
// Detect anomalies
def buildAnomalyDetector(data: DataFrame): Unit = {
val pipelineModel = fitPipeline4(data, 180)
val kMeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
val centroids = kMeansModel.clusterCenters
val clustered = pipelineModel.transform(data)
val threshold = clustered.
select("cluster", "scaledFeatureVector").as[(Int, Vector)].
map { case (cluster, vec) => Vectors.sqdist(centroids(cluster), vec) }.
orderBy($"value".desc).take(100).last
val originalCols = data.columns
val anomalies = clustered.filter { row =>
val cluster = row.getAs[Int]("cluster")
val vec = row.getAs[Vector]("scaledFeatureVector")
Vectors.sqdist(centroids(cluster), vec) >= threshold
}.select(originalCols.head, originalCols.tail:_*)
println(anomalies.first())
}
}
-----------------------------------------------------------------------------------------------------------------------