// 算法小白的第一次尝试 -- LinearRegression (a beginner's first attempt at linear regression with Spark)

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.storage.StorageLevel
/**
  * @author XiaoTangBao
  * @date 2019/3/4 10:47
  * @version 1.0
  */
/**
  * Spark 2.x (spark.ml) linear-regression demo: reads a CSV of US state
  * statistics, trains an elastic-net linear regression, reports training
  * diagnostics, persists/reloads the model, and scores a held-out split.
  */
object LR {
  def main(args: Array[String]): Unit = {
    // Suppress noisy Spark logging.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    // Spark 2.x entry point: SparkSession wraps SparkContext and SQLContext.
    val sparkSession = SparkSession.builder().master("local[4]").appName("LR").getOrCreate()
    val sc = sparkSession.sparkContext

    // Parse each CSV line into an 8-column tuple of doubles, directly on the
    // RDD. (The original collect()-then-parallelize round-trip pulled the whole
    // dataset into driver memory for no benefit.)
    val data = sc.textFile("G:\\mldata\\murder.txt")
      .map(_.split(','))
      .map(arr => (arr(0).toDouble, arr(1).toDouble, arr(2).toDouble, arr(3).toDouble,
        arr(4).toDouble, arr(5).toDouble, arr(6).toDouble, arr(7).toDouble))

    // Simple hold-out validation: 70% train / 30% test.
    val Array(trainSplit, testSplit) = data.randomSplit(Array(0.7, 0.3))

    // Column schema; the 5th column ("label") is the regression target.
    val schema = StructType(List(
      StructField("Population", DoubleType, true),
      StructField("Income", DoubleType, true),
      StructField("Illiteracy", DoubleType, true),
      StructField("LifeExp", DoubleType, true),
      StructField("label", DoubleType, true),
      StructField("HSGrad", DoubleType, true),
      StructField("Frost", DoubleType, true),
      StructField("Area", DoubleType, true)))

    // Assembler that merges the seven feature columns into one "features" vector.
    val inputCols = Array("Population", "Income", "Illiteracy", "LifeExp", "HSGrad", "Frost", "Area")
    val assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("features")

    // Build train/test DataFrames and cache them in memory.
    def toRow(t: (Double, Double, Double, Double, Double, Double, Double, Double)): Row =
      Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8)
    val trainData = sparkSession.createDataFrame(trainSplit.map(toRow), schema).cache()
    val testData = sparkSession.createDataFrame(testSplit.map(toRow), schema).cache()

    // Vectorize and keep only the columns the estimator needs.
    val vecDF = assembler.transform(trainData).select("label", "features").persist(StorageLevel.MEMORY_ONLY)
    val vecDF2 = assembler.transform(testData).select("label", "features").persist(StorageLevel.MEMORY_ONLY)

    // Train an elastic-net-regularized linear regression.
    val model = new LinearRegression()
      .setStandardization(true) // standardize features before fitting
      .setMaxIter(10000)        // iteration cap
      .setRegParam(0.3)         // regularization strength
      .setElasticNetParam(0.8)  // elastic-net mixing (0 = pure L2, 1 = pure L1)
      .fit(vecDF)

    // Training diagnostics.
    val trainingSummary = model.summary
    // Residuals of the fitted model.
    trainingSummary.residuals.show()
    // Mean squared error.
    println("mse:" + trainingSummary.meanSquaredError)
    // BUG FIX: this value is the root-mean-squared error, not r-squared;
    // label it correctly and print the actual r-squared as well.
    println("rmse:" + trainingSummary.rootMeanSquaredError)
    println("r-squared:" + trainingSummary.r2)

    // Persist the model, reload it, and score the held-out test split.
    model.write.overwrite().save("C:\\users\\Java_Man_China\\desktop\\model1")
    val newModel = LinearRegressionModel.load("C:\\users\\Java_Man_China\\desktop\\model1")
    val prediction = newModel.transform(vecDF2).selectExpr("label", "prediction")
    prediction.show()
  }
}

// ----------------------------------------- spark 2.0 以前版本 (pre-Spark-2.0 mllib API) ---------------------------------------
/**
  * Pre-Spark-2.0 (spark.mllib RDD API) version of the same exercise:
  * single-feature linear regression trained with SGD on a salary CSV,
  * then predictions on a held-out split shown next to the true labels.
  */
object LR2 {
  def main(args: Array[String]): Unit = {
    // Suppress noisy Spark logging.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    // Pre-2.0 entry points: SparkContext + SQLContext built from a SparkConf.
    val conf = new SparkConf().setMaster("local[4]").setAppName("LR")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Read the CSV and skip the header row without collecting the whole file
    // to the driver. (The original collect()/drop(1)/parallelize round-trip
    // defeats distributed processing and can exhaust driver memory.)
    val data = sc.textFile("G:\\mldata\\Salary_Data.csv")
      .mapPartitionsWithIndex((idx, it) => if (idx == 0) it.drop(1) else it)
      .map(_.split(','))
      .map(arr => (arr(0).toDouble, arr(1).toDouble))

    // Simple hold-out validation: 70% train / 30% test.
    val splits = data.randomSplit(Array(0.7, 0.3))
    // Column 1 is the feature (years of experience), column 2 the label (salary).
    val trainData = splits(0).map(t => LabeledPoint(t._2, Vectors.dense(t._1))).cache()
    val testData = splits(1).map(t => LabeledPoint(t._2, Vectors.dense(t._1))).persist()

    // SGD hyper-parameters.
    val numIterations = 5000
    val stepSize = 0.25
    val miniBatchFraction = 1.0 // fraction of data used per SGD iteration
    val model = LinearRegressionWithSGD.train(trainData, numIterations, stepSize, miniBatchFraction)

    // Score the held-out split and pair each prediction with its true label.
    val prediction = model.predict(testData.map(lp => lp.features))
    import sqlContext.implicits._
    val result = prediction.zip(testData.map(lp => lp.label))
    val resultDF = result.toDF("prediction", "real")
    resultDF.show()
  }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值