一、A Distributed Hadoop Example
Create the hadoop user's home directory on HDFS, then create an input folder under it:
./bin/hdfs dfs -mkdir -p /user/hadoop
./bin/hdfs dfs -mkdir ./input
Upload the text files and list them to confirm:
./bin/hdfs dfs -put /home/hadoop/下载/*.txt input
./bin/hdfs dfs -ls
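Before moving on, a quick sanity check from spark-shell (a minimal sketch; the file name anticipates the result files used in the next step) confirms the uploaded files are readable:
// Returns the first line of one uploaded file; fails fast if the path is wrong
sc.textFile("input/result_bigdata.txt").first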
Merging the three tables into one
The Scala code below runs in spark-shell. It assumes all three input files are tab-separated: the two result files hold (student id, course, score) per line, and student.txt holds (student id, name).
// Read the two score files into (id, course, score) tuples
val a = sc.textFile("input/result_bigdata.txt").map{x => val line = x.split("\t"); (line(0), line(1), line(2).toInt)}
val b = sc.textFile("input/result_math.txt").map{x => val line = x.split("\t"); (line(0), line(1), line(2).toInt)}
// Union the two score tables
val a_b = a union b
// Sum the scores per student id with reduceByKey
val total_score = a_b.map(x => (x._1, x._3)).reduceByKey((x, y) => x + y)
// Average over the two courses (integer division, so any fraction is truncated)
val average = total_score.map{x => (x._1, x._2 / 2)}
// Math scores keyed by student id
val math_score = b.map{x => (x._1, x._3)}
// Bigdata scores keyed by student id
val bigdata_score = a.map{x => (x._1, x._3)}
// Read the student table into (id, name) pairs
val c = sc.textFile("input/student.txt").map{x => val line = x.split("\t"); (line(0), line(1))}
// Join the math and bigdata scores
val score1 = math_score.join(bigdata_score)
// Join the total score with the average
val score2 = total_score.join(average)
// Join the two intermediate tables
val score = score1.join(score2)
// Join the student table with the final score table
val flag = c.join(score)
// Inspect the result
flag.collect
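As a follow-up sketch (the flattening map and the output path output/student_scores are assumptions, not part of the original run), the nested join result can be reshaped into one readable line per student and written back to HDFS:
// Flatten (id, (name, ((math, bigdata), (total, avg)))) into a tab-separated row
val report = flag.map { case (id, (name, ((math, bigdata), (total, avg)))) =>
  s"$id\t$name\t$math\t$bigdata\t$total\t$avg"
}
// Save one line per student to HDFS (hypothetical output path)
report.saveAsTextFile("output/student_scores")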
二、A Spark Build That Supports Hive
Problem: the Spark build used above was not compiled with Hive support, so it cannot read or write Hive data.
Solution: install a Spark distribution compiled with Hive support (here spark-2.1.0-bin-h27hive), as follows.
Reference: "Spark入门:连接Hive读写数据(DataFrame)" (Getting Started with Spark: Connecting to Hive to Read and Write Data with DataFrames)
Install the pre-compiled Spark build:
sudo tar -zxf ~/下载/spark-2.1.0-bin-h27hive.tgz -C /usr/local
Rename the directory:
sudo mv /usr/local/spark-2.1.0-bin-h27hive /usr/local/sparkwithhive
Change the directory's ownership:
sudo chown -R hadoop:hadoop /usr/local/sparkwithhive
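With ownership fixed, a minimal verification sketch (assuming Hive is already installed and its hive-site.xml has been copied into sparkwithhive's conf directory) confirms from the new build's spark-shell that Spark can reach Hive:
// Sketch only: build a Hive-enabled session and list Hive's databases
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("HiveCheck")
  .enableHiveSupport()
  .getOrCreate()

// If the connection works, this prints Hive's database list
spark.sql("show databases").show()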