import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.storage.StorageLevel
/**
* @author XiaoTangBao
* @date 2019/3/4 10:47
* @version 1.0
*/
object LR {
  def main(args: Array[String]): Unit = {
    // Suppress unnecessary log output
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    // Create a SparkSession; since Spark 2.x it wraps SparkContext and SQLContext
    val sparkSession = SparkSession.builder().master("local[4]").appName("LR").getOrCreate()
    val sc = sparkSession.sparkContext
    // Load the data source and parse each line into eight numeric fields
    val data = sc.textFile("G:\\mldata\\murder.txt")
      .map(_.split(','))
      .map(arr => (arr(0).toDouble, arr(1).toDouble, arr(2).toDouble, arr(3).toDouble,
        arr(4).toDouble, arr(5).toDouble, arr(6).toDouble, arr(7).toDouble))
    // Simple hold-out split: 70% training, 30% test
    val pdData = data.randomSplit(Array(0.7, 0.3))
    val schema = StructType(List(
      StructField("Population", DoubleType, true),
      StructField("Income", DoubleType, true),
      StructField("Illiteracy", DoubleType, true),
      StructField("LifeExp", DoubleType, true),
      StructField("label", DoubleType, true),
      StructField("HSGrad", DoubleType, true),
      StructField("Frost", DoubleType, true),
      StructField("Area", DoubleType, true)))
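    // Note (a sketch, not in the original): the same file could also be loaded directly as a
    // DataFrame with the schema above, assuming murder.txt is plain comma-separated numbers
    // with no header row:
    //   val df = sparkSession.read.schema(schema).csv("G:\\mldata\\murder.txt")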
    // Assemble the feature columns into a single "features" vector column
    val inputCols = Array("Population", "Income", "Illiteracy", "LifeExp", "HSGrad", "Frost", "Area")
    val assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("features")
    // Build the training and test DataFrames and cache them in memory
    val trainData = sparkSession.createDataFrame(
      pdData(0).map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8)), schema).cache()
    val testData = sparkSession.createDataFrame(
      pdData(1).map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8)), schema).cache()
    // Apply the assembler so each row carries a label and a features vector
    val vecDF = assembler.transform(trainData).select("label", "features").persist(StorageLevel.MEMORY_ONLY)
    val vecDF2 = assembler.transform(testData).select("label", "features").persist(StorageLevel.MEMORY_ONLY)
    // Train the model
    val model = new LinearRegression()
      .setStandardization(true)   // standardize the features before fitting
      .setMaxIter(10000)          // maximum number of iterations
      .setRegParam(0.3)           // regularization strength
      .setElasticNetParam(0.8)    // elastic-net mixing parameter (0 = L2, 1 = L1)
      .fit(vecDF)
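    // Optional (a sketch, not in the original): inspect the fitted coefficients and intercept.
    println("coefficients: " + model.coefficients + ", intercept: " + model.intercept)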
    // Inspect the training summary
    val trainingSummary = model.summary
    // Residuals of the fitted model
    trainingSummary.residuals.show()
    // Mean squared error and root mean squared error on the training data
    println("mse: " + trainingSummary.meanSquaredError)
    println("rmse: " + trainingSummary.rootMeanSquaredError)
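    // Optional (a sketch, not in the original): the summary also exposes the coefficient of
    // determination, which the original "r-squared" label probably intended.
    println("r2: " + trainingSummary.r2)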
    // Persist the model to disk
    model.write.overwrite().save("C:\\users\\Java_Man_China\\desktop\\model1")
    // Load the model back
    val newModel = LinearRegressionModel.load("C:\\users\\Java_Man_China\\desktop\\model1")
    // Predict on the test set
    val prediction = newModel.transform(vecDF2).selectExpr("label", "prediction")
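    // Optional (a sketch, not in the original): measure test error with RegressionEvaluator,
    // comparing the "prediction" column against the "label" column.
    import org.apache.spark.ml.evaluation.RegressionEvaluator
    val testRmse = new RegressionEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("rmse")
      .evaluate(prediction)
    println("test rmse: " + testRmse)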
    prediction.show()
  }
}

// ----------------------------------- Pre-Spark 2.0 version -----------------------------------

// Extra imports needed by the RDD-based (spark.mllib) example below
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
object LR2 {
  def main(args: Array[String]): Unit = {
    // Suppress unnecessary log output
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    val conf = new SparkConf().setMaster("local[4]").setAppName("LR")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Load the salary data; drop(1) skips the CSV header row
    val data = sc.parallelize(sc.textFile("G:\\mldata\\Salary_Data.csv").collect().drop(1))
      .map(_.split(','))
      .map(arr => (arr(0).toDouble, arr(1).toDouble))
    // Simple hold-out split: 70% training, 30% test
    val pdData = data.randomSplit(Array(0.7, 0.3))
    // Column 0 is the single feature, column 1 is the label
    val trainData = pdData(0).map(t => LabeledPoint(t._2, Vectors.dense(t._1))).cache()
    val testData = pdData(1).map(t => LabeledPoint(t._2, Vectors.dense(t._1))).persist()
    // SGD hyperparameters
    val numIterations = 5000
    val stepSize = 0.25
    val miniBatchFraction = 1.0
    val model = LinearRegressionWithSGD.train(trainData, numIterations, stepSize, miniBatchFraction)
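    // Optional (a sketch, not in the original): the fitted weight vector and intercept.
    // The intercept is 0.0 here because the static train() helpers do not add one by default.
    println("weights: " + model.weights + ", intercept: " + model.intercept)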
    // Predict on the test features
    val prediction = model.predict(testData.map(lp => lp.features))
    import sqlContext.implicits._
    // Pair each prediction with its true label
    val result = prediction.zip(testData.map(lp => lp.label))
    val resultDF = result.toDF("prediction", "real")
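    // Optional (a sketch, not in the original): mean squared error computed directly from
    // the (prediction, label) pairs.
    val testMse = result.map { case (p, l) => math.pow(p - l, 2) }.mean()
    println("test mse: " + testMse)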
    resultDF.show()
  }
}