1. WordCount Example
Requirement: count the number of occurrences of each word in a file and sort the results in ascending order of count.
Data preparation
hadoop spark hbase hive java scala
hadoop spark hbase hive java scala
hadoop spark hbase hive java scala
hadoop spark hbase hive java scala
hadoop spark hbase hive java scala
hadoop spark hbase hive java scala
hadoop spark hbase hive java scala
hadoop spark hbase hive java scala
The data can be stored locally or on HDFS.
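If you want to generate the sample file programmatically, here is a minimal sketch; the path data/words.txt and the object name MakeWordsFile are placeholders, not part of the original example.

package cn.spark.demo1

import java.io.PrintWriter

object MakeWordsFile {
  def main(args: Array[String]): Unit = {
    // write the eight identical sample lines to a local file (path is a placeholder)
    val pw = new PrintWriter("data/words.txt")
    try {
      (1 to 8).foreach(_ => pw.println("hadoop spark hbase hive java scala"))
    } finally {
      pw.close()
    }
  }
}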
The Scala program:
package cn.spark.demo1

import org.apache.spark.{SparkConf, SparkContext}

object WordCount {

  def main(args: Array[String]): Unit = {
    // whether to run in local mode
    val isLocal = args(0).toBoolean
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
    if (isLocal) {
      conf.setMaster("local[*]")
    }
    // create the SparkContext
    val sc = new SparkContext(conf)
    // create an RDD from the input file
    val lines = sc.textFile(args(1))
    // split each line on spaces and flatten into individual words
    val words = lines.flatMap(_.split(" "))
    // map each word to a (word, 1) tuple for counting
    val wordsAndOne = words.map((_, 1))
    // aggregate the counts per word
    val reduce = wordsAndOne.reduceByKey(_ + _)
    // sort by count in ascending order
    val sort = reduce.sortBy(_._2)
    // write the result to the output path
    sort.saveAsTextFile(args(2))
    // release resources
    sc.stop()
  }
}
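To try the job locally, one option is to call main directly with the three expected arguments; the wrapper object and the paths below are illustrative only, and the output directory must not exist yet.

package cn.spark.demo1

object WordCountLocalRun {
  def main(args: Array[String]): Unit = {
    // args: isLocal, input path (local file or hdfs:// URI), output directory
    WordCount.main(Array("true", "data/words.txt", "out/wordcount"))
  }
}

When the job is submitted to a cluster, the same three arguments are supplied on the spark-submit command line with the first set to false, so no master is hard-coded in the program.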
2. Order Amount Example
Data preparation
{"oid":"o123", "cid": 1, "money": 600.0, "longitude":116.397128,"latitude":39.916527}
"oid":"o112", "cid": 3, "money": 200.0, "longitude":118.396128,"latitude":35.916527}
{"oid":"o124", "cid": 2, "money": 200.0, "longitude":117.397128,"latitude":38.916527}
{"oid":"o125", "cid": 3, "money": 100.0, "longitude":118.397128,"latitude":35.916527}
{"oid":"o127", "cid": 1, "money": 100.0, "longitude":116.395128,"latitude":39.916527}
{"oid":"o128", "cid": 2, "money": 200.0, "longitude":117.396128,"latitude":38.916527}
{"oid":"o129", "cid": 3, "money": 300.0, "longitude":115.398128,"latitude":35.916527}
{"oid":"o130", "cid": 2, "money": 100.0, "longitude":116.397128,"latitude":39.916527}
{"oid":"o131", "cid": 1, "money": 100.0, "longitude":117.394128,"latitude":38.916527}
{"oid":"o132", "cid": 3, "money": 200.0, "longitude":118.396128,"latitude":35.916527}
Product categories (cid and category name):
1 手机
2 家具
3 服装
Requirement: parse the data, sum the order amount per category (cid), and produce results of the form
(手机, 800)
(家具, 500)
(服装, 600)
Store the results in MySQL.
Each line is parsed as JSON with fastjson and bound to a JavaBean; the aggregated results are then written to MySQL. Note that the second data line above is missing its opening brace: it is malformed sample data that the parsing error handling below filters out, which is why it does not count toward the 服装 total.
package cn.spark.demo2

import java.sql.{Connection, Date, DriverManager, PreparedStatement}
import com.alibaba.fastjson.{JSON, JSONException}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.{Logger, LoggerFactory}

object OrderDemo {

  private val logger: Logger = LoggerFactory.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {
    // whether to run in local mode
    val isLocal: Boolean = args(0).toBoolean
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
    if (isLocal) {
      conf.setMaster("local[*]")
    }
    // create the SparkContext
    val sc: SparkContext = new SparkContext(conf)
    // create an RDD from the input file
    val lines = sc.textFile(args(1))
    // parse each line into an OrderBean1
    val tp: RDD[OrderBean1] = lines.map(line => {
      var json: OrderBean1 = null
      try {
        json = JSON.parseObject(line, classOf[OrderBean1])
      } catch {
        case e: JSONException =>
          // log the malformed line and keep going
          logger.error("line is error " + line)
      }
      // null for lines that failed to parse
      json
    })
    // filter out the records that failed to parse
    val tp1 = tp.filter(_ != null)
    // map each bean to a (cid, money) tuple so it can be joined later
    val tup: RDD[(Int, Double)] = tp1.map(t => (t.cid, t.money))
    // sum the amount per category id
    val reduced: RDD[(Int, Double)] = tup.reduceByKey(_ + _)
    // build an RDD with the category id -> category name mapping
    val arr: RDD[(Int, String)] = sc.parallelize(Array((1, "手机"), (2, "家具"), (3, "服装")))
    // join the two RDDs, e.g. (1, (800.0, "手机")) becomes ("手机", 800.0)
    val jin = reduced.join(arr).map(t => (t._2._2, t._2._1))
    // write each partition to MySQL
    jin.foreachPartition(dataMysql)
    // release resources
    sc.stop()
  }
  // writes one partition of (category, amount) tuples to MySQL
  val dataMysql: Iterator[(String, Double)] => Unit = (it: Iterator[(String, Double)]) => {
    var conn: Connection = null
    var ps: PreparedStatement = null
    try {
      // open the MySQL connection
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/scott?characterEncoding=UTF-8", "root", "123456")
      // prepare the insert statement
      ps = conn.prepareStatement("insert into ordered1 values (null,?,?,?)")
      // bind each tuple to the statement
      it.foreach(t => {
        ps.setString(1, t._1)
        ps.setDouble(2, t._2)
        ps.setDate(3, new Date(System.currentTimeMillis()))
        // add to the batch
        ps.addBatch()
      })
      // flush the batch to the database
      ps.executeBatch()
    } catch {
      case e: Exception => logger.error("failed to write partition to MySQL", e)
    } finally {
      // release resources: statement first, then connection
      if (ps != null) ps.close()
      if (conn != null) conn.close()
    }
  }
}
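The job expects the ordered1 table to exist before it runs. The original does not show the table definition, so the sketch below is only an assumption derived from the statement insert into ordered1 values (null,?,?,?): an auto-increment id followed by a name, an amount, and a date column.

package cn.spark.demo2

import java.sql.DriverManager

object CreateOrderedTable {
  def main(args: Array[String]): Unit = {
    // same connection settings as dataMysql above
    val conn = DriverManager.getConnection(
      "jdbc:mysql://localhost:3306/scott?characterEncoding=UTF-8", "root", "123456")
    try {
      val stmt = conn.createStatement()
      try {
        // assumed schema: column names and types are illustrative, not from the original
        stmt.execute(
          """CREATE TABLE IF NOT EXISTS ordered1 (
            |  id BIGINT PRIMARY KEY AUTO_INCREMENT,
            |  name VARCHAR(64),
            |  money DOUBLE,
            |  dt DATE
            |)""".stripMargin)
      } finally {
        stmt.close()
      }
    } finally {
      conn.close()
    }
  }
}

Writing inside foreachPartition (rather than foreach) means one connection and one batch per partition instead of one per record, which keeps the number of JDBC connections low.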
package cn.spark.demo2
import scala.beans.BeanProperty
// JavaBean that fastjson binds each JSON order record to
class OrderBean1 extends Serializable {

  @BeanProperty
  var oid: String = _
  @BeanProperty
  var cid: Int = _
  @BeanProperty
  var money: Double = _
  @BeanProperty
  var longitude: Double = _
  @BeanProperty
  var latitude: Double = _

  override def toString = s"OrderBean($oid, $cid, $money, $longitude, $latitude)"
}
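As a quick standalone check of the JSON binding, the sketch below parses one of the sample lines; fastjson populates the bean through the getters/setters generated by @BeanProperty. The wrapper object is illustrative only.

package cn.spark.demo2

import com.alibaba.fastjson.JSON

object OrderBean1ParseDemo {
  def main(args: Array[String]): Unit = {
    val line = """{"oid":"o123", "cid": 1, "money": 600.0, "longitude":116.397128,"latitude":39.916527}"""
    // fastjson fills the bean through the generated getters/setters
    val bean = JSON.parseObject(line, classOf[OrderBean1])
    println(bean) // prints OrderBean(o123, 1, 600.0, 116.397128, 39.916527)
  }
}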