Spark-Hdfs

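 addr.dat (IP-range dictionary, read from HDFS at /test/addr.dat; fields are separated by "|": field 3 = numeric range start, field 4 = numeric range end, field 8 = location name returned by the lookup)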

1.0.1.1|1.0.1.2|16777471|16777472|亚洲|中国|山东|遥墙||电信|350100|China|CN|119.306239|26.075302
1.0.1.1|1.0.1.2|16777473|16777474|亚洲|中国|山东|王楼||电信|350100|China|CN|119.306239|26.075302
1.0.1.1|1.0.1.2|16777475|16777476|亚洲|中国|山东|济南||电信|350100|China|CN|119.306239|26.075302
1.0.1.2|1.0.1.3|16777477|16777478|亚洲|中国|山东|潍坊||电信|350100|China|CN|119.306239|26.075302
1.0.1.3|1.0.1.4|16777479|16777480|亚洲|中国|山东|烟台||电信|350100|China|CN|119.306239|26.075302
1.0.1.4|1.0.1.5|16777481|16777482|亚洲|中国|山东|菏泽||电信|350100|China|CN|119.306239|26.075302
1.0.1.5|1.0.1.6|16777483|16777484|亚洲|中国|山东|聊城||电信|350100|China|CN|119.306239|26.075302
1.0.1.6|1.0.1.7|16777485|16777486|亚洲|中国|山东|威海||电信|350100|China|CN|119.306239|26.075302
1.0.1.7|1.0.1.8|16777487|16777488|亚洲|中国|山东|青岛||电信|350100|China|CN|119.306239|26.075302

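ip.log (access log, read from data/ip.log; fields are separated by "|": 1 = timestamp, 2 = client IP address, 3 = domain, 4 = requested path)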

20090121000132124542000|1.0.1.1|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.2|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.3|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.4|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.5|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.6|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.7|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.7|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.8|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/

 

package com.ws.ip

import java.io.{BufferedReader, FileReader, InputStreamReader}
import java.net.URI
import java.util

import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.hadoop.hdfs.client.HdfsAdmin

import scala.io.{BufferedSource, Source}

/**
 * IP-range dictionary loaded once from HDFS, plus helpers to look an address up in it.
 *
 * Every dictionary line is `|`-separated. The lookup reads field index 2 (range start)
 * and index 3 (range end) as numbers and returns field index 7. The binary search
 * requires the lines to be ordered by ascending, non-overlapping ranges.
 */
object IpDictUtil {
  /** Non-blank lines of the dictionary file, in file order. */
  val dict: util.ArrayList[String] = new util.ArrayList[String]()

  // Static initializer: everything below runs the first time the object is referenced.
  // Local-file alternative that was used before switching to HDFS:
//  val source: BufferedSource = Source.fromFile("data/addr.dat")
//  val reader: BufferedReader = source.bufferedReader()

  // Kept public for backward compatibility. It holds the line most recently read by the
  // loader and is null once loading has finished.
  var line: String = ""

  // FileSystem.get may hand back a shared, cached instance, so the file system itself is
  // left open and only the reader (and the stream it wraps) is closed after loading.
  private val system: FileSystem = FileSystem.get(URI.create("hdfs://dream1:9000"), new Configuration())
  private val stream: FSDataInputStream = system.open(new Path("/test/addr.dat"))
  // NOTE(review): decodes with the platform default charset - confirm it matches the file.
  private val reader = new BufferedReader(new InputStreamReader(stream))
  try {
    line = reader.readLine()
    while (line != null) {
      if (StringUtils.isNotBlank(line)) {
        dict.add(line)
      }
      line = reader.readLine()
    }
  } finally {
    reader.close()
  }

  /**
   * Converts a dotted address such as "1.0.1.1" to a number by folding its parts from
   * left to right, shifting the accumulated value by 8 bits before adding each part.
   *
   * @param ip dot-separated numeric parts
   * @return the combined numeric value
   * @throws NumberFormatException if a part is not a valid number
   */
  def ip2Long(ip: String): Long =
    ip.split("[.]").foldLeft(0L)((acc, part) => part.toLong | (acc << 8L))

  /**
   * Recursive binary search over [[dict]] for the range that contains `key`.
   *
   * @param key  numeric address to look up
   * @param low  first index of the search window (inclusive)
   * @param high end of the search window (exclusive); defaults to the dictionary size
   * @return field index 7 of the matching line, or "-1" when no range in the window
   *         contains `key`
   */
  @scala.annotation.tailrec
  def recursionBinarySearch(key: Long, low: Int = 0, high: Int = dict.size()): String = {
    if (low >= high) {
      // Empty window: nothing left to examine.
      "-1"
    } else {
      // Written as low + half-width so the sum cannot overflow Int.
      val mid = low + (high - low) / 2
      val fields = dict.get(mid).split("[|]")
      val rangeStart = fields(2).toLong
      val rangeEnd = fields(3).toLong
      if (rangeStart <= key && key <= rangeEnd) {
        fields(7)
      } else if (rangeEnd < key) {
        // mid has been ruled out, so continue strictly to its right. Restarting at mid
        // would need a `low != mid` stop condition, which skips one-element windows.
        recursionBinarySearch(key, mid + 1, high)
      } else {
        recursionBinarySearch(key, low, mid)
      }
    }
  }

  /** Manual smoke test: looks one key up among the first six dictionary entries. */
  def main(args: Array[String]): Unit = {
    println(recursionBinarySearch(16777485,0,6))
  }
}
package com.ws.ip

import com.ws.orderCount.OrderCountToHbase
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.{Logger, LoggerFactory}

/**
 * Spark job that counts the records of `data/ip.log` per location, resolving each
 * record's IP address through `IpDictUtil`, and prints the counts.
 */
object IpLogGetAddress {

  // Bound to this object so log output is attributed to the right job.
  private val logger: Logger = LoggerFactory.getLogger(getClass)

  /**
   * Runs the job on a local master and prints the (location, count) pairs.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("OrderCountJoinDict").setMaster("local[*]")
    val sc: SparkContext = new SparkContext(conf)
    try {
      // Records are `|`-separated: 1st field is the time, 2nd the IP address,
      // 3rd the domain, 4th the requested path.
      val file: RDD[String] = sc.textFile("data/ip.log")
      val resolved: RDD[(String, Int)] = file.map { line =>
        try {
          val ip = line.split("[|]")(1)
          (IpDictUtil.recursionBinarySearch(IpDictUtil.ip2Long(ip)), 1)
        } catch {
          // A malformed record must not fail the job; it is tagged "-1" and dropped below.
          case scala.util.control.NonFatal(e) =>
            logger.warn(s"failed to resolve address for line: $line", e)
            ("-1", 1)
        }
      }
      // "-1" marks both unparsable records and addresses with no dictionary match.
      val located: RDD[(String, Int)] = resolved.filter(_._1 != "-1")
      val counts: RDD[(String, Int)] = located.reduceByKey(_ + _)
      println(counts.collect().toBuffer)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值