Spark 读取 HBase 数据

本文介绍了一种使用 Apache Spark 和 HBase 进行数据读取的方法,并展示了如何从 HBase 表中扫描特定数据行,同时提取用户基本信息、关注者统计特征及标签信息等数据。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

  1. import org.apache.hadoop.hbase.HBaseConfiguration  
  2. import org.apache.hadoop.hbase.mapreduce.TableInputFormat  
  3. import org.apache.spark.sql.SparkSession  
  4. import scala.collection.mutable  
  5.   
  /**
    * Helpers for reading HBase tables into Spark through `TableInputFormat`.
    */
  object HbaseUtils {

    /**
      * Scans an HBase table and exposes the matching rows as an RDD.
      *
      * @param spark     session whose SparkContext executes the scan
      * @param tableName name of the HBase table to read
      * @param columns   columns to fetch, each written as "family:qualifier";
      *                  an empty buffer leaves the scan unrestricted
      * @param startRow  row key passed as the scan start; skipped when null or empty
      * @param endRow    row key passed as the scan stop; skipped when null or empty
      * @return one (row key, Result) pair per scanned row
      */
    def scan(spark: SparkSession,
             tableName: String,
             columns: mutable.ArrayBuffer[String],
             startRow: String = null,
             endRow: String = null
            ): org.apache.spark.rdd.RDD[(org.apache.hadoop.hbase.io.ImmutableBytesWritable,
                                         org.apache.hadoop.hbase.client.Result)] = {

      val scanConf = HBaseConfiguration.create()
      scanConf.set(TableInputFormat.INPUT_TABLE, tableName)

      // The column list is passed as one space-delimited string. The original
      // guard was inverted (`columns.length == 0`), so the restriction was only
      // "applied" when there was nothing to restrict.
      if (columns.nonEmpty) {
        scanConf.set(TableInputFormat.SCAN_COLUMNS, columns.mkString(" "))
      }

      // Option(...) turns the null defaults into None; empty strings are dropped too.
      Option(startRow).filter(_.nonEmpty)
        .foreach(row => scanConf.set(TableInputFormat.SCAN_ROW_START, row))
      Option(endRow).filter(_.nonEmpty)
        .foreach(row => scanConf.set(TableInputFormat.SCAN_ROW_STOP, row))

      // Raised timeouts for long scans (values unchanged; presumably milliseconds).
      scanConf.set("mapreduce.task.timeout", "1200000")
      scanConf.set("hbase.client.scanner.timeout.period", "600000")
      scanConf.set("hbase.rpc.timeout", "600000")

      spark.sparkContext.newAPIHadoopRDD(
        scanConf,
        classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])
    }
  }
  44.   
  45.   
  46. import org.apache.hadoop.hbase.client.Result  
  47. import org.apache.hadoop.hbase.util.Bytes  
  48. import org.apache.spark.SparkConf  
  49. import org.apache.spark.sql.SparkSession  
  50. import org.apache.log4j.{Level, Logger}  
  51. import scala.collection.mutable  
  52. import org.apache.spark.sql.functions.{col, split}  
  53.   
  /**
    * Spark job that reads per-user features from column family "f1" of the HBase
    * table "online_social_feature" and writes them out as a CSV with a header row.
    */
  object MakeBaseInfo {

    val BaseInfo: Array[String] = Array("uid", "age", "height", "weight", "role", "vbadge",
      "has_photos", "video_verified", "is_human_face", "has_description", "ip_location", "has_avatar",
      "vip")

    val Follow: Array[String] = Array("followed_num", "followed_with_time", "follower_num")

    val StatisFeature: Array[String] = Array("click", "clicked", "show", "send_session",
      "receive_session", "desc_len", "online_click", "online_clicked", "online_show", "online_showed",
      "nearby_click", "nearby_clicked", "nearby_show", "nearby_showed", "newbie_click", "newbie_clicked",
      "newbie_show", "newbie_showed", "social_stay_time", "visit_count", "visited_count")

    // tag_<group>_<index>: groups 1..5 hold 4, 12, 4, 12 and 5 tags respectively.
    val TagInfo: Array[String] =
      Array(4, 12, 4, 12, 5).zipWithIndex.flatMap { case (size, group) =>
        (1 to size).map(index => s"tag_${group + 1}_$index")
      }

    val humanFeature: Array[String] = Array("max_ratio", "max_beauty", "is_human_body")

    private val ColumnFamily = "f1"

    // Stored values that are treated the same as a missing cell.
    private val EmptyMarkers: Set[String] = Set("NULL", "null", "None")

    // Base-info fields whose fallback is not "None".
    private val BaseInfoDefaults: Map[String, String] = Map(
      "role" -> "-1",
      "vbadge" -> "0",
      "has_photos" -> "0",
      "video_verified" -> "0",
      "has_description" -> "0")

    /**
      * Exported columns in output order, each paired with the value used when the
      * cell is missing or empty. This single list drives both the extraction from
      * HBase and the CSV column names, so the two cannot drift apart.
      * "uid" comes from the row key and "followed_with_time" is not exported.
      */
    private val ExportedFeatures: Array[(String, String)] =
      BaseInfo.filterNot(_ == "uid").map(f => f -> BaseInfoDefaults.getOrElse(f, "None")) ++
        Array("followed_num", "follower_num").map(_ -> "0") ++
        StatisFeature.map(_ -> "None") ++
        TagInfo.map(_ -> "0")

    /** Entry point: scan HBase, spread the features into named columns, write the CSV. */
    def main(args: Array[String]): Unit = {

      val spark = getSparkSql

      import spark.implicits._

      try {
        val onlineFeature = getBaseUsersInfo(spark).toDF("uid", "item")

        // "item" is an array column; position i holds ExportedFeatures(i).
        val featureColumns = ExportedFeatures.zipWithIndex.map {
          case ((name, _), position) => col("item").getItem(position).as(name)
        }

        onlineFeature
          .select(col("uid") +: featureColumns: _*)
          .write.format("csv").option("header", "true").save("/data/wangtao/test/")
      } finally {
        // Release the session even when the scan or the write fails.
        spark.stop()
      }
    }

    /** Builds (or reuses) the Hive-enabled session used by this job. */
    def getSparkSql: SparkSession = {

      val jobName = "aaa"
      val parallelism = "100"

      Logger.getLogger("org").setLevel(Level.WARN)

      val sparkConf = new SparkConf()
        .setAppName(jobName)
        .set("spark.default.parallelism", parallelism)
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .set("spark.hadoop.validateOutputSpecs", "false")

      SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
    }

    /**
      * Scans "online_social_feature" and returns, per row, the row key as uid plus
      * the exported feature values in ExportedFeatures order.
      *
      * The values travel as an array rather than a comma-joined string, so a value
      * that itself contains a comma can no longer shift the columns after it.
      */
    private def getBaseUsersInfo(spark: SparkSession): org.apache.spark.rdd.RDD[(String, Array[String])] = {

      // Same column set the job has always requested from the scan.
      val columns = mutable.ArrayBuffer(
        (BaseInfo ++ Follow ++ TagInfo ++ StatisFeature).map(field => s"$ColumnFamily:$field"): _*)

      // Local copies so the closure captures plain values.
      val family = ColumnFamily
      val exported = ExportedFeatures

      HbaseUtils.scan(spark, "online_social_feature", columns).map {
        case (_, result) =>
          val uid = Bytes.toString(result.getRow)
          val values = exported.map {
            case (field, default) => rowValueOrDefault(result, family, field, default)
          }
          (uid, values)
      }
    }

    /** Reads cf:field from the row, falling back to `default` when the cell is missing or empty. */
    private def rowValueOrDefault(result: Result, cf: String, field: String, default: String): String = {
      // Bytes.toBytes avoids depending on the platform default charset.
      val raw = Bytes.toString(result.getValue(Bytes.toBytes(cf), Bytes.toBytes(field)))
      if (filedValueIsEmpty(raw)) default else raw
    }

    /**
      * Appends the value of cf:field to `features`, or `default` when the cell is
      * missing or empty.
      *
      * @param result   HBase row to read from
      * @param features buffer that receives the value
      * @param cf       column family
      * @param field    column qualifier
      * @param default  value appended when the cell is missing or empty
      */
    def takeRowValue(result: Result, features: mutable.ArrayBuffer[String],
                     cf: String, field: String, default: String): Unit =
      features.append(rowValueOrDefault(result, cf, field, default))

    /**
      * True when `value` is null, the empty string, or one of "NULL", "null", "None".
      * (The misspelled name is kept because it is public.)
      */
    def filedValueIsEmpty(value: String): Boolean =
      value == null || value.isEmpty || EmptyMarkers.contains(value)

  }
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值