# coding=utf-8
from functools import reduce  # reduce is a builtin in Python 2 but must be imported in Python 3

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("getiplocal").setMaster("local[*]")
sc = SparkContext(conf=conf)
# Specify where to read the data from
# 1. The rules: IP ranges stored as integers (ipnum)
rdd1 = sc.textFile("hdfs://spark2:9000/hadoop/data/ip_pool.data")
# Parse each rule line into (startNum, endNum, county)
def fun1(x):
    a = x.split(",")
    startNum = int(a[1])  # numeric start of the IP range
    endNum = int(a[3])    # numeric end of the IP range
    county = a[5]         # region name for this range
    return startNum, endNum, county
rdd2 = rdd1.map(fun1)
iprules = rdd2.collect()  # an RDD cannot be broadcast directly; the data must first be collected from the executors back to the driver
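# The binary search below assumes iprules is sorted by startNum; if
# ip_pool.data is not already sorted, sort it here before broadcasting:
# iprules.sort(key=lambda r: r[0])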
broad = sc.broadcast(iprules)
# ip2long: convert a dotted-quad IP string into a single integer so ranges can be compared
def ip2long(ip_string):
    if '.' not in ip_string:
        return ip_string  # not a dotted quad; pass it through unchanged
    return reduce(lambda a, b: a << 8 | b, map(int, ip_string.split(".")))
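# Quick sanity check of ip2long (example values, not part of the job itself):
# "1.0.1.0" -> 1*2**24 + 0*2**16 + 1*2**8 + 0 = 16777472
assert ip2long("1.0.1.0") == 16777472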
# binary_search: find the index of the rule whose [startNum, endNum] range contains ip
def binary_search(tuple_list, ip):
    left, right = 0, len(tuple_list) - 1
    while left <= right:
        mid = (left + right) // 2
        if ip <= tuple_list[mid][1]:
            if ip >= tuple_list[mid][0]:
                return mid  # ip falls inside this rule's range
            right = mid - 1
        else:
            left = mid + 1
    return -1  # no rule matches this ip
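# ---------------------------------------------------------------------------
# Sketch of how the pieces above are typically tied together. The original
# listing ends at binary_search, so everything below is an assumption: the
# access-log path, its "|" delimiter, and its field layout are hypothetical.
# ---------------------------------------------------------------------------
def fun2(line):
    fields = line.split("|")                 # assumed delimiter
    ip = ip2long(fields[1])                  # assumed: client IP is the second field
    index = binary_search(broad.value, ip)   # look the ip up in the broadcast rules
    if index == -1:
        return ("unknown", 1)
    return (broad.value[index][2], 1)        # emit (county, 1) for counting

# rdd3 = sc.textFile("hdfs://spark2:9000/hadoop/data/access.log")  # hypothetical path
# result = rdd3.map(fun2).reduceByKey(lambda a, b: a + b)
# print(result.collect())
# sc.stop()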