# Bootstrap Spark and load the raw tab-separated input files.
sc = SparkContext(appName="MySparkApplication")
lines = sc.textFile("/home/fei/sparkcode/data/")
# Each record becomes a list of fields: [id, day-code, site-code, count].
data = lines.map(lambda line: line.split('\t'))
def dayToNum(record):
    """Decode one raw record into [id, day_index, site_index, count].

    record[1] encodes the day: the characters at positions 1 and 3 are the
    1-based week and weekday digits, flattened to a 0-based day index
    (week - 1) * 7 + (weekday - 1).
    record[2] encodes the site: everything after the first character is a
    1-based site number, converted to a 0-based index.
    record[3] is the count, parsed as an int.
    """
    day_field = record[1]
    day_index = (int(day_field[1]) - 1) * 7 + (int(day_field[3]) - 1)
    site_index = int(record[2][1:]) - 1
    return [record[0], day_index, site_index, int(record[3])]
data2 = data.map(dayToNum)
# BUG FIX: the original referenced `data3` on the next line but never defined
# it, so the script died with a NameError. Reconstructed here as a keyed
# one-hot count vector: downstream code reshapes each summed vector to
# (8, 7, 10) = 560 entries, i.e. 56 day slots (8 weeks x 7 weekdays) x 10
# sites, so the flat position of a record is day_index * 10 + site_index.
# NOTE(review): layout inferred from the reshape(8, 7, 10) at the clustering
# step — confirm against the intended data model.
data3 = data2.map(
    lambda r: (r[0], [r[3] if i == r[1] * 10 + r[2] else 0 for i in range(560)])
)
# Element-wise sum of the per-record vectors for each id.
data4 = data3.reduceByKey(lambda v1, v2: [a + b for a, b in zip(v1, v2)])
data4.saveAsTextFile("/home/fei/combinedData")
# Drop the keys and turn each summed count vector into a numpy array
# so MLlib's k-means can consume it.
data5 = data4.map(lambda pair: np.array(pair[1]))
k = 100             # number of clusters
mode = "k-means||"  # scalable k-means++ initialization
model = KMeans.train(data5, k, initializationMode=mode)
# Persist the cluster centers to disk and echo results to stdout.
# BUG FIX: the original wrote the "Final centers" line twice (copy-paste
# duplicate) and never closed the file on error; the with-block guarantees
# the handle is closed and the line is written exactly once.
with open('/home/fei/kmeans_result.txt', 'w') as output:
    output.write("Final centers: " + str(model.clusterCenters))
print("Final centers: " + str(model.clusterCenters))
print("Total Cost: " + str(model.computeCost(data5)))
# Pair every vector with its assigned cluster id, then reshape the flat
# 560-vector into a (week, weekday, site) cube and cache it for reuse.
belongs = data5.map(lambda v: (model.predict(v), list(v)))
data6 = belongs.map(lambda kv: (kv[0], np.array(kv[1]).reshape(8, 7, 10))).cache()
# Extract the 8-week time series for weekday 0 at site 0.
data00 = data6.map(lambda kv: kv[1]).map(lambda cube: cube[:, 0, 0])
# Label = week index 6, features = weeks 0-5 (week 7 is held out).
data_train = data00.map(lambda series: LabeledPoint(series[6], series[:6]))
# Train a linear model on the 6-week windows and forecast the next week.
# Renamed from `model` to avoid shadowing the KMeans model used above.
reg_model = LinearRegressionWithSGD.train(data_train)
# BUG FIX: the original called `model.predict(data[1][1:7,0,0])` — `data` is
# an RDD and cannot be subscripted (TypeError), and a single-vector predict
# returns a scalar, which has no saveAsTextFile. Predict per series instead:
# slice [1:7] uses the 6 most recent observed weeks as features, matching the
# original slice's evident intent of forecasting the following week.
# NOTE(review): assumes per-record forecasting was intended — confirm.
result = data00.map(lambda series: reg_model.predict(series[1:7]))
result.saveAsTextFile('/home/fei/lines')
sc.stop()