This post converts the UCI breast_cancer dataset into a Spark DataFrame and walks through how to build and tune models with pyspark.
from sklearn import datasets
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create a SparkSession if one is not already provided (e.g. by a notebook environment)
spark = SparkSession.builder.appName('breast_cancer_tuning').getOrCreate()
cancer = datasets.load_breast_cancer()
X, y = cancer.data, cancer.target
columns = ['f{}'.format(i) for i in range(1, 31)]
df = pd.concat([pd.DataFrame(X, columns=columns), pd.DataFrame(y, columns=['label'])], axis=1)
spark_df = spark.createDataFrame(df.values.tolist(), df.columns.tolist())
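As an optional sanity check (not in the original run), you can confirm the conversion worked before moving on:

spark_df.printSchema()   # 30 double columns f1..f30 plus label
print(spark_df.count())  # 569 rows in breast_cancer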
Preprocessing and splitting the dataset
vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
pipeline = Pipeline(stages=[vecAssembler])
pipelineFit = pipeline.fit(spark_df)
dataset = pipelineFit.transform(spark_df)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], 123)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))
>>Training Dataset Count: 381
>>Test Dataset Count: 188
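To see what VectorAssembler produced, an optional peek at the assembled column:

# Each row now carries a 30-dimensional feature vector alongside the label
dataset.select('features', 'label').show(3, truncate=False)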
Logistic regression: training and prediction with default parameters
# Train the model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
# Predict on the test set
prediction = lrModel.transform(testData)
# Score ROC/PR on the probability column
evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability')
print('areaUnderROC:', evaluator.evaluate(prediction, {evaluator.metricName: 'areaUnderROC'}))
print('areaUnderPR:', evaluator.evaluate(prediction, {evaluator.metricName: 'areaUnderPR'}))
# Compute accuracy as the fraction of correctly classified rows
print('accuracy:', prediction.filter(prediction.label == prediction.prediction).count()/prediction.count())
>>areaUnderROC: 0.9819204980842913
>>areaUnderPR: 0.9856307317265275
>>accuracy: 0.9308510638297872
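For reference, the fitted model also carries a training summary; reading the training-set ROC from it is a one-liner (an optional check, available for binary logistic regression in Spark 2.0+):

# areaUnderROC computed on the *training* data by the training summary
print('training areaUnderROC:', lrModel.summary.areaUnderROC)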
Tuning the logistic regression model
# Create ParamGrid for Cross Validation
grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.1, 0.3, 0.5])         # regularization strength
        .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])  # elastic net mixing (0 = ridge)
        .build())
evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability', metricName='areaUnderROC')
# Create a 3-fold CrossValidator
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=3)
cvModel = cv.fit(trainingData)
Printing the best parameters
results = [
    ([{param.name: value} for param, value in params.items()], metric)
    for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
]
# Highest average cross-validation ROC first
sorted(results, key=lambda el: el[1], reverse=True)[0]
>>([{'regParam': 0.1}, {'elasticNetParam': 0.0}], 0.9953622902152315)
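Note that CrossValidator has already refit the winning combination on the full training set, so instead of re-creating the estimator by hand (as done next) you could use the fitted best model directly. A minimal sketch (the param getters on the fitted model require Spark 3.x):

best_lr = cvModel.bestModel                  # LogisticRegressionModel refit with the best params
print(best_lr.getRegParam(), best_lr.getElasticNetParam())
prediction_cv = best_lr.transform(testData)  # same predictions without a manual re-fit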
Predicting with the best parameters
lr_new = LogisticRegression(maxIter=20, regParam=0.1, elasticNetParam=0)
lrModel_new = lr_new.fit(trainingData)
# Predict on the test set
prediction_new = lrModel_new.transform(testData)
print('areaUnderROC:', evaluator.evaluate(prediction_new, {evaluator.metricName: 'areaUnderROC'}))
print('areaUnderPR:', evaluator.evaluate(prediction_new, {evaluator.metricName: 'areaUnderPR'}))
# Compute accuracy
print('accuracy:', prediction_new.filter(prediction_new.label == prediction_new.prediction).count()/prediction_new.count())
>>areaUnderROC: 0.9855124521072797
>>areaUnderPR: 0.9877182758155769
>>accuracy: 0.9468085106382979
All three evaluation metrics improved.
Trying a random forest next
rf = RandomForestClassifier(numTrees=3, maxDepth=10, maxBins=30, labelCol="label", seed=123)
grid = (ParamGridBuilder()
        .addGrid(rf.numTrees, [1, 3, 5])
        .addGrid(rf.maxDepth, [3, 5, 7, 10])
        .addGrid(rf.maxBins, [20, 30, 40])
        .build())
evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability', metricName='areaUnderROC')
cv = CrossValidator(estimator=rf,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=3)
cvModel_rf = cv.fit(trainingData)
# Predict on the test set and evaluate ROC
predictions = cvModel_rf.transform(testData)
print('areaUnderROC:', evaluator.evaluate(predictions))
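The same accuracy check used for logistic regression applies here as well (an optional addition, not in the original post):

# Fraction of test rows whose predicted label matches the true label
print('accuracy:', predictions.filter(predictions.label == predictions.prediction).count()/predictions.count())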
Printing the best parameters
results = [
    ([{param.name: value} for param, value in params.items()], metric)
    for params, metric in zip(cvModel_rf.getEstimatorParamMaps(), cvModel_rf.avgMetrics)
]
# Highest average cross-validation ROC first
sorted(results, key=lambda el: el[1], reverse=True)[0]
>>([{'numTrees': 5}, {'maxDepth': 3}, {'maxBins': 40}], 0.9851099948526418)
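The best forest itself is available as cvModel_rf.bestModel; an optional follow-up is to inspect which of f1..f30 drive its predictions:

best_rf = cvModel_rf.bestModel   # RandomForestClassificationModel refit on the full training set
# SparseVector of per-feature importances, ordered as in the VectorAssembler input
print(best_rf.featureImportances)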
To summarize: this post converted the UCI breast_cancer dataset into a Spark DataFrame and used pyspark to build and tune logistic regression and random forest models. The data was first preprocessed and split; a LogisticRegression model was then trained with baseline parameters and evaluated. A CrossValidator grid search found better parameters, and retraining with them improved every evaluation metric. The same procedure was applied to the random forest to identify its best parameter combination.