Using a Crawler to Obtain the IPs a Domain Resolves To

This post presents a Python web-crawling approach: rotating through multiple User-Agent headers to avoid being blocked by the site, parsing responses to obtain the list of IPs a domain resolves to, and then querying the domains associated with those IPs. The full script below walks through handling the resulting data, and should be useful to readers interested in web crawling and IP management.

# -*- coding: utf-8 -*-
import json
import requests
import io
from bs4 import BeautifulSoup
import random
import time
def getUrlText(url):
    # Rotate through a pool of User-Agent strings so repeated requests are
    # less likely to be blocked by the target site.
    headers = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3315.4 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'
    ]
    header = {'User-Agent': random.choice(headers)}
    rs = requests.get(url, headers=header, timeout=10)
    # Let requests guess the real encoding so Chinese pages decode correctly.
    rs.encoding = rs.apparent_encoding
    return rs
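# Quick sanity check of the helper (hypothetical usage, not part of the
# original flow):
#   rs = getUrlText('http://site.ip138.com/')
#   print(rs.status_code, rs.encoding)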
def getIps(url):
    # The read.do endpoint returns JSON, so parse it directly; the original
    # also built a BeautifulSoup object here, but it was never used.
    rs1 = getUrlText(url).text
    dictinfo = json.loads(rs1)
    ips = []
    for i in dictinfo['data']:
        ips.append(i['ip'])
    return ips
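# Judging from the parsing above, the endpoint returns JSON shaped roughly
# like {"data": [{"ip": "1.2.3.4", ...}, ...]} (the full field set is not
# verified here); only each record's 'ip' value is collected.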
def getNewDomains(ip):
    # Reverse lookup: site.ip138.com lists the domains hosted on an IP as
    # <li><a> entries under the '#list' element.
    NewDomains = []
    url1 = 'http://site.ip138.com/' + ip + '/'
    rs1 = getUrlText(url1).text
    soup = BeautifulSoup(rs1, 'html.parser')
    domains = soup.select('#list > li')
    for domain in domains:
        try:
            i = domain.select('a')[0].text
            # Keep only domains containing '360', skipping duplicates.
            if '360' in i and i not in NewDomains:
                NewDomains.append(i)
        except IndexError:
            # An <li> without an <a> child; ignore it.
            pass
    return NewDomains
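# Illustrative result (hypothetical values): an IP hosting 360 services
# might yield ['www.360.cn', 'so.360.cn']; an empty list means no listed
# domain contained '360'.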
def loadDatadet(infile):
    # Read previously saved results, one per line, and de-duplicate them so
    # a re-run can skip IPs that were already recorded.
    with open(infile, 'r') as f:
        dataset = [line.rstrip('\n') for line in f]
    return list(set(dataset))
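# Each saved line is the str() of [ip, domain_list] written by the sweep
# below, e.g. "['1.2.3.1', ['www.360.cn']]" (illustrative value), so the
# loaded strings double as a seen-before set.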
urlInput = input('Enter the domain to query: ')
urlall = 'http://site.ip138.com/' + urlInput + '/'
rs1 = getUrlText(urlall)
# read.do expects a millisecond timestamp as a cache-busting parameter.
t1 = str(int(time.time() * 1000))
print(rs1, t1)
url = 'http://site.ip138.com/domain/read.do?domain=' + urlInput + '&time=' + t1
ips = getIps(url)

# Pool every matching domain found on each resolved IP, de-duplicated.
NewDomains = []
for ip in ips:
    NewDomains.extend(getNewDomains(ip))
NewDomains = list(set(NewDomains))
# Resolve each discovered domain in turn, keeping a per-domain map and a
# de-duplicated pool of every IP seen.
ipsUrl = {}
ipsNew = []
for Domains in NewDomains:
    try:
        t2 = str(int(time.time() * 1000))
        DomainsUrl = 'http://site.ip138.com/domain/read.do?domain=' + Domains + '&time=' + t2
        ips = getIps(DomainsUrl)
        ipsUrl.update({Domains: ips})
        ipsNew.extend(ips)
        ipsNew = list(set(ipsNew))
    except Exception:
        # Skip domains whose lookup fails or returns malformed JSON.
        pass
print(ipsNew)
# Resume from an earlier run if a results file for this domain exists.
try:
    fn = urlInput + '.txt'
    IpNewDomains = loadDatadet(fn)
except IOError:
    IpNewDomains = []

# Sweep the /24 around each resolved IP and append any host that still
# serves matching domains to the results file.
for ip1 in ipsNew:
    prefix = '.'.join(ip1.split('.')[:3])
    for i in range(1, 255):
        try:
            ipNew = prefix + '.' + str(i)
            str2 = getNewDomains(ipNew)
            str3 = str([ipNew, str2])
            if str2:
                if str3 not in IpNewDomains:
                    # Store the same string form that loadDatadet reads
                    # back, so duplicate checks keep working across runs.
                    IpNewDomains.append(str3)
                    print(str3)
                    with io.open(fn, 'a', encoding='utf-8') as f:
                        f.write(str3 + '\n')
                else:
                    print('IP already recorded!')
        except Exception:
            pass
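One refinement worth considering (a sketch, not part of the original script): the /24 sweep issues up to 254 reverse lookups per source IP, so pacing the requests keeps the crawler from hammering site.ip138.com. The wrapper below is hypothetical; it only adds a randomized delay around the existing getNewDomains, using the time and random modules the script already imports.

def politeGetNewDomains(ip, low=1.0, high=3.0):
    # Hypothetical helper: sleep a random interval before each reverse
    # lookup so the sweep stays polite, then delegate to getNewDomains.
    time.sleep(random.uniform(low, high))
    return getNewDomains(ip)

# Usage inside the sweep loop, replacing the direct call:
#   str2 = politeGetNewDomains(ipNew)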