import sys
import time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.feature import StandardScaler
def extract_features(field, categoriesMap, featureEnd):
    # one-hot encode the category column (index 3)
    categoryIdx = categoriesMap[field[3]]
    categoryFeatures = np.zeros(len(categoriesMap))
    categoryFeatures[categoryIdx] = 1
    # convert the remaining numeric columns, mapping "?" to 0
    numericalFeatures = [convert_float(x) for x in field[4:featureEnd]]
    return np.concatenate((categoryFeatures, numericalFeatures))
def extract_label(field):
    label = field[-1]
    return float(label)
def convert_float(x):
    # missing values are marked "?" in the data set; treat them as 0
    return 0 if x == "?" else float(x)
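As a quick sanity check of the three helpers, here is a hypothetical row laid out like a train.tsv record (category in column 3, numeric fields after it, label in the last column); the toy category map and row are made up purely for illustration:

# hypothetical inputs, only to show the expected shapes
toyCategoriesMap = {"business": 0, "sports": 1, "tech": 2}
toyRow = ["id", "url", "boilerplate", "tech", "0.5", "?", "1.2", "1"]
print(extract_features(toyRow, toyCategoriesMap, len(toyRow) - 1))  # one-hot + numeric: [0., 0., 1., 0.5, 0., 1.2]
print(extract_label(toyRow))                                        # 1.0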
global Path
if sc.master[0:5] == 'local':
    Path = 'file:/home/swt/pythonwork/PythonProject/'
else:
    Path = "hdfs://localhost:9000/user/swt/"
# def prepare_data(sc):
print('load data...')
rawDataWithHeader = sc.textFile(Path + 'data/train.tsv')
header = rawDataWithHeader.first()
# drop the header row, strip the quotes, and split each line on tabs
rawData = rawDataWithHeader.filter(lambda x: x != header)
rData = rawData.map(lambda x: x.replace("\"", ""))
lines = rData.map(lambda x: x.split("\t"))
print("total rows: " + str(lines.count()))
LogisticRegression in PySpark
This article explores how to use the LogisticRegression algorithm in PySpark for data analysis and prediction. Through an example, it explains model construction, parameter tuning, and result evaluation, providing a practical guide to understanding and applying logistic regression.
