[Python] 从ip138网站爬取ip所处地点

最新推荐文章于 2022-10-18 10:19:30 发布

Cumu_

最新推荐文章于 2022-10-18 10:19:30 发布

阅读量6.3k

点赞数

CC 4.0 BY-SA版权

分类专栏： python python 文章标签： python 爬取ip所处地址 ip138

本文链接：https://blog.youkuaiyun.com/JThink_/article/details/28597757

python 同时被 2 个专栏收录

6 篇文章

订阅专栏

python

2 篇文章

订阅专栏

本文介绍了一种方法，通过批量下载IP数据，利用特定URL获取每个IP对应的位置信息，并将其存储到本地文件中。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1. 首先从纯真 IP 网站（http://www.cz88.net/）下载最新的 IP 数据，数据格式如下：

0.0.0.0 0.255.255.255 IANA保留地址 CZ88.NET
1.0.0.0 1.0.0.255 澳大利亚 CZ88.NET
1.0.1.0 1.0.3.255 福建省 电信
1.0.4.0 1.0.7.255 澳大利亚 CZ88.NET

2. 根据ip爬取该ip所处地点

import urllib.parse
import urllib.request
from html.parser import HTMLParser
import time

# Module-level state shared across calls to get_location_from_ip(), which
# rebinds these names via ``global``:
#   START / END    - start and end IP of the range most recently written
#                    as a new row
#   LOCATION / NET - location and network of the previously processed range
START = END = LOCATION = NET = ''

class MyHTMLParser(HTMLParser):
    """HTML parser that captures the text following the '本站主数据' marker.

    After a page has been fed, ``region`` holds the remainder of the last
    text node that started with the marker, with the marker and the single
    character after it removed. It stays '' if no such node was seen.
    """

    _MARKER = '本站主数据'

    def __init__(self):
        super().__init__()
        self.region = ''

    def handle_data(self, data):
        # Skip the marker plus the one separator character that follows it.
        if data.startswith(self._MARKER):
            self.region = data[len(self._MARKER) + 1:]

def remove_invalid_space(line):
    """Split ``line`` on runs of whitespace and return the list of tokens."""
    tokens = line.split()
    return tokens

def format_one_line(line):
    """Normalise one tokenised data row to ``[start, end, location, net]``.

    ``line`` is the list of whitespace-separated tokens of one row. The
    first three tokens are returned unchanged; every remaining token is
    concatenated (without a separator) into the single ``net`` field.

    Rows with exactly three tokens get an empty ``net`` instead of a copy of
    the location, and rows with more than six tokens keep all of their
    trailing tokens instead of only the last one.

    Raises:
        IndexError: if ``line`` has fewer than three tokens.
    """
    start, end, location = line[0], line[1], line[2]
    # Whatever follows the location belongs to the net column, however many
    # pieces the whitespace split produced.
    net = ''.join(line[3:])
    return [start, end, location, net]
    

def get_location_from_ip(line):
    """Look up one IP range on ip138 and record the result in ip_tmp.txt.

    ``line`` is a sequence whose first two items are the start and end IP of
    a range; only those two items are read. The start IP is sent to ip138,
    the text after the '本站主数据' marker is extracted from the returned
    page, and its first token is taken as the location and its last token
    (when there is more than one) as the net.

    If location and net equal those of the previous call, the last row of
    ip_tmp.txt is rewritten so that it spans from the remembered START to
    this range's end IP; otherwise a new row is appended and START/END are
    updated. LOCATION and NET are updated on every call.

    If the page contains no marker text, a notice is printed and the range
    is written as its own row with empty location and net; such rows are
    never merged.
    """
    global LOCATION, NET, START, END

    url = 'http://www.ip138.com/ips1388.asp'
    params = urllib.parse.urlencode({'ip': line[0], 'action': '2'})

    # One request is made per input row, so release the connection
    # deterministically instead of leaving it to garbage collection.
    with urllib.request.urlopen(url + '?' + params) as response:
        html = response.read().decode('GBK')

    parser = MyHTMLParser()
    parser.feed(html)
    parser.close()

    region = remove_invalid_space(parser.region)
    if not region:
        # Indexing region[0] here would raise IndexError.
        print('no location found for ' + str(line[0]))
    location = region[0] if region else ''
    net = region[-1] if len(region) > 1 else ''

    # The truthiness guard keeps an unknown result from matching the initial
    # empty LOCATION/NET state (or a previous unknown result).
    if location and LOCATION == location and NET == net:
        over_write_tmp_file(START + ' ' + str(line[1]) + ' ' + location + ' ' + net)
    else:
        write_to_tmp_file(str(line[0]) + ' ' + str(line[1]) + ' ' + location + ' ' + net)
        START = line[0]
        END = line[1]

    LOCATION = location
    NET = net

def write_to_tmp_file(line):
    """Append ``line`` followed by a newline to ip_tmp.txt.

    The file is opened relative to the current working directory. A
    FileNotFoundError is not propagated; 'file not found' is printed instead.
    """
    try:
        with open('ip_tmp.txt', 'a') as tmp_file:
            tmp_file.write(line + '\n')
    except FileNotFoundError:
        print('file not found')

def over_write_tmp_file(line):
    """Replace the last line of ip_tmp.txt with ``line``.

    The file (relative to the current working directory) is read, its last
    line is dropped, ``line`` plus a newline is appended, and the result is
    written back.

    If the file does not exist yet, 'file not found' is printed and the file
    is created containing only ``line``. A FileNotFoundError while opening
    the file for writing is reported the same way and not propagated.
    """
    # Start from an empty list so a missing file cannot leave the name
    # unbound for the write phase below.
    kept = []
    try:
        with open('ip_tmp.txt') as tmp_file:
            kept = tmp_file.readlines()[:-1]
    except FileNotFoundError:
        print('file not found')

    # Build the new content before opening with 'w', which truncates.
    kept.append(line + '\n')
    try:
        with open('ip_tmp.txt', 'w') as tmp_file:
            tmp_file.writelines(kept)
    except FileNotFoundError:
        print('file not found')
            
def format_ip_file(path):
    """Process every row of the IP data file at ``path``.

    Each non-blank line is tokenised, normalised and passed to
    get_location_from_ip(), with a 0.1 second pause after each lookup.
    Blank or whitespace-only lines are skipped. A FileNotFoundError is not
    propagated; 'file not found' is printed instead.
    """
    try:
        with open(path) as ip_file:
            for line in ip_file:
                if not line.strip():
                    # An empty token list would make format_one_line raise
                    # IndexError and abort the whole run.
                    continue
                # main logic of get location from ip
                get_location_from_ip(format_one_line(remove_invalid_space(line)))
                # Pause between requests to limit the load on the remote site.
                time.sleep(0.1)
    except FileNotFoundError:
        print('file not found')

# Script body: process the input file and print progress markers.
print('start')
# Raw string: '\w', '\P' and '\i' are not valid escape sequences, so a plain
# literal triggers an invalid-escape warning on newer Python versions. The
# resulting path is unchanged.
format_ip_file(r'D:\workspace\Python\ip\ip.txt')
print('end', end='')

3. 最好在两次请求之间设置延时，否则过于频繁的请求可能会把 ip138 搞崩溃。