addr.dat
1.0.1.1|1.0.1.2|16777471|16777472|亚洲|中国|山东|遥墙||电信|350100|China|CN|119.306239|26.075302
1.0.1.1|1.0.1.2|16777473|16777474|亚洲|中国|山东|王楼||电信|350100|China|CN|119.306239|26.075302
1.0.1.1|1.0.1.2|16777475|16777476|亚洲|中国|山东|济南||电信|350100|China|CN|119.306239|26.075302
1.0.1.2|1.0.1.3|16777477|16777478|亚洲|中国|山东|潍坊||电信|350100|China|CN|119.306239|26.075302
1.0.1.3|1.0.1.4|16777479|16777480|亚洲|中国|山东|烟台||电信|350100|China|CN|119.306239|26.075302
1.0.1.4|1.0.1.5|16777481|16777482|亚洲|中国|山东|菏泽||电信|350100|China|CN|119.306239|26.075302
1.0.1.5|1.0.1.6|16777483|16777484|亚洲|中国|山东|聊城||电信|350100|China|CN|119.306239|26.075302
1.0.1.6|1.0.1.7|16777485|16777486|亚洲|中国|山东|威海||电信|350100|China|CN|119.306239|26.075302
1.0.1.7|1.0.1.8|16777487|16777488|亚洲|中国|山东|青岛||电信|350100|China|CN|119.306239|26.075302
ip.log
20090121000132124542000|1.0.1.1|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.2|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.3|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.4|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.5|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.6|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.7|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.7|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
20090121000132124542000|1.0.1.8|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/
package com.ws.ip
import java.io.{BufferedReader, FileReader, InputStreamReader}
import java.net.URI
import java.util
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.hadoop.hdfs.client.HdfsAdmin
import scala.io.{BufferedSource, Source}
/**
 * Loads an IP-range dictionary from HDFS when the object is first initialised and
 * offers helpers to convert a dotted IPv4 string to a number and to look a number
 * up in the dictionary.
 *
 * Each dictionary line is '|'-separated; the lookup reads field 2 as the numeric
 * range start, field 3 as the numeric range end and returns field 7.
 */
object IpDictUtil {
  /** Non-blank lines of the dictionary file, in file order. */
  val dict: util.ArrayList[String] = new util.ArrayList[String]()

  // Static initialisation block: runs once, when the object is first referenced.
  // Local-file alternative that was used previously:
  // val source: BufferedSource = Source.fromFile("data/addr.dat")
  // val reader: BufferedReader = source.bufferedReader()

  /** Most recently read line; `null` once the whole file has been consumed. */
  var line: String = ""

  private val system: FileSystem = FileSystem.get(URI.create("hdfs://dream1:9000"), new Configuration())
  private val stream: FSDataInputStream = system.open(new Path("/test/addr.dat"))
  // NOTE(review): the reader uses the platform default charset; confirm that it
  // matches the encoding of the dictionary file.
  private val reader = new BufferedReader(new InputStreamReader(stream))

  try {
    line = reader.readLine()
    while (line != null) {
      if (StringUtils.isNotBlank(line)) {
        dict.add(line)
      }
      line = reader.readLine()
    }
  } finally {
    // Closing the reader also closes the wrapped HDFS stream. The FileSystem
    // handle is deliberately left open because it is not owned by this object alone.
    reader.close()
  }

  /**
   * Converts a dotted address string to its numeric value by folding the
   * fragments left to right, shifting the accumulator by 8 bits each step.
   *
   * @param ip dotted address such as "1.0.1.1"
   * @return the numeric value of the address
   * @throws NumberFormatException if a fragment is not a valid number
   */
  def ip2Long(ip: String): Long =
    ip.split("[.]").foldLeft(0L)((acc, fragment) => (acc << 8) | fragment.toLong)

  /**
   * Recursive binary search over `dict` in the half-open index range `[low, high)`.
   *
   * Assumes the dictionary lines are sorted by ascending, non-overlapping ranges
   * (TODO confirm against the data file).
   *
   * @param key  numeric IP to look up
   * @param low  first index to search (inclusive)
   * @param high end index of the search (exclusive)
   * @return field 7 of the matching dictionary line, or "-1" when no range in
   *         `[low, high)` contains `key`
   */
  @scala.annotation.tailrec
  def recursionBinarySearch(key: Long, low: Int = 0, high: Int = dict.size()): String = {
    if (low >= high) {
      "-1"
    } else {
      // Written this way to avoid Int overflow of (low + high) on large dictionaries.
      val mid = low + (high - low) / 2
      val fields = dict.get(mid).split("[|]")
      val rangeStart = fields(2).toLong
      val rangeEnd = fields(3).toLong
      if (rangeStart <= key && key <= rangeEnd) {
        fields(7)
      } else if (rangeEnd < key) {
        // mid has been examined already, so continue strictly to its right.
        recursionBinarySearch(key, mid + 1, high)
      } else {
        // mid is excluded because high is exclusive; every index in [low, mid),
        // including low itself, is still examined.
        recursionBinarySearch(key, low, mid)
      }
    }
  }

  /** Manual smoke test: searches only indices [0, 6) of the dictionary for a fixed key. */
  def main(args: Array[String]): Unit = {
    println(recursionBinarySearch(16777485, 0, 6))
  }
}
package com.ws.ip
import com.ws.orderCount.OrderCountToHbase
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.{Logger, LoggerFactory}
/**
 * Spark job that reads `data/ip.log`, resolves the IP of every log line through
 * [[IpDictUtil]], and prints how many lines were resolved to each returned value.
 */
object IpLogGetAddress {
  // Named after this object so log output is attributed to the right class.
  private val logger: Logger = LoggerFactory.getLogger(IpLogGetAddress.getClass)

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("OrderCountJoinDict").setMaster("local[*]")
    val sc: SparkContext = new SparkContext(conf)
    try {
      // Log lines are '|'-separated: the first field is the time, the second the
      // IP address, the third the domain and the fourth the requested path.
      val file: RDD[String] = sc.textFile("data/ip.log")

      val resulterr: RDD[(String, Int)] = file.map { line =>
        try {
          val ip = line.split("[|]")(1)
          val ipNum = IpDictUtil.ip2Long(ip)
          (IpDictUtil.recursionBinarySearch(ipNum), 1)
        } catch {
          // Malformed lines are logged and tagged with the "-1" sentinel so they
          // are dropped below; fatal errors and interrupts still propagate.
          case scala.util.control.NonFatal(e) =>
            logger.warn("failed to resolve address for line: " + line, e)
            ("-1", 1)
        }
      }

      // "-1" marks both unparseable lines and IPs with no dictionary match.
      val orders: RDD[(String, Int)] = resulterr.filter(_._1 != "-1")
      val value: RDD[(String, Int)] = orders.reduceByKey(_ + _)
      println(value.collect().toBuffer)
    } finally {
      // Always release the Spark context, even when the job fails.
      sc.stop()
    }
  }
}