# -*-coding:UTF-8-*-
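# Scrapes a machine/node list page (the table class used below looks like the
# one on a Jenkins "computers" page -- an assumption, not confirmed by the source)
# and records each machine's IP and online/offline status into per-day .info files.
# Usage: python <this script> <url> <filename>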
import os
import sys
import datetime
import re

import requests
from bs4 import BeautifulSoup as bs
if len(sys.argv) < 3:
    print("Not enough arguments! Please pass a url and a filename")
    sys.exit(1)
url = sys.argv[1]
filename = sys.argv[2]
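# Results are appended to <filename>_<month>_<day>.info (online machines)
# and <filename>_offline_<month>_<day>.info (offline machines).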
#### handle the date ####
currentdate = datetime.date.today()
year = currentdate.year
month = currentdate.month
day = currentdate.day
print("local time is :" + str(year) + "年" + str(month) + "月" + str(day) + "日")
#### fetch the page content ####
print("url is " + url)
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
'Cookie': 'Cookie'}
url_content = requests.get(url, headers=header)
url_content = url_content.content
content_set = bs(url_content, 'html.parser')
# print content_set
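# The table with class 'sortable pane bigtable' is assumed to be the node list;
# every <tr> describes one machine.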
computers = content_set.find('table', class_='sortable pane bigtable')
computers = computers.find_all('tr')
# print computers
#### delete today's output files if they already exist ####
print("local dir is :" + os.getcwd())
online_file = filename + "_" + str(month) + "_" + str(day) + ".info"
offline_file = filename + "_offline_" + str(month) + "_" + str(day) + ".info"
if os.path.exists(online_file):
    os.remove(online_file)
if os.path.exists(offline_file):
    os.remove(offline_file)
def get_ip(computer):
    computer_id = computer.get('id')
    if computer_id is None:
        return None, None
    id_find = re.findall('[0-9]+', computer_id)
    print(id_find)
    return id_find, computer_id
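# Walk every row: a status icon whose alt text is "[offline]" marks a machine
# that is down; everything else is treated as online. Each output line has the
# form "<ip> <status> <url>/<row id>".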
for computer in computers:
    for is_online in computer.find_all('td'):
        png = is_online.find('img')
        # print(png)
        if png is None:
            continue
        png = png.get('alt')
        print("the png is :" + str(png))
        if str(png) == "[offline]":
            print("this machine is offline")
            id_find, computer_id = get_ip(computer)
            if id_find is None:
                continue
            with open(offline_file, 'a+') as ip_info:
                ips = ".".join(str(its) for its in id_find)
                ip_info.write(ips + " " + "offline" + " " + url + "/" + str(computer_id) + "\n")
        else:
            id_find, computer_id = get_ip(computer)
            if id_find is None:
                continue
            with open(online_file, 'a+') as ip_info:
                ips = ".".join(str(its) for its in id_find)
                ip_info.write(ips + " " + "online" + " " + url + "/" + str(computer_id) + "\n")
# Upgraded Scrapy version of this IP-scraping crawler: https://blog.youkuaiyun.com/qq_46020608/article/details/113132339