BeautifulSoup, requests, and re module case study: scraping a Jenkins page for the IPs of online and offline machines and saving them to files

# -*-coding:UTF-8-*-

import os, sys
import datetime
import requests
import re
from bs4 import BeautifulSoup as bs

if len(sys.argv) < 3:
    print("Not enough arguments! Please pass in a url and a filename")
    sys.exit(1)
url = sys.argv[1]
filename = sys.argv[2]

#### Handle the date ####
currentdate = datetime.date.today()
year = currentdate.year
month = currentdate.month
day = currentdate.day
print("local time is: " + str(year) + "-" + str(month) + "-" + str(day))

#### Fetch the page ####
print("url is " + url)
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
    'Cookie': 'Cookie'}  # placeholder: fill in your own Jenkins session cookie

url_content = requests.get(url, headers=header)
url_content = url_content.content

content_set = bs(url_content, 'html.parser')
# print(content_set)

# The machine list lives in the sortable node table on the Jenkins page.
computers = content_set.find('table', class_='sortable pane bigtable')
if computers is None:
    print("Could not find the machine table on the page")
    sys.exit(1)

computers = computers.find_all('tr')
# print(computers)


#### Remove existing output files for today, if any ####
print("local dir is: " + os.getcwd())
online_file = filename + "_" + str(month) + "_" + str(day) + ".info"
offline_file = filename + "_offline_" + str(month) + "_" + str(day) + ".info"
if os.path.exists(online_file):
    os.remove(online_file)
if os.path.exists(offline_file):
    os.remove(offline_file)


def get_ip(computer):
    # A node row carries its name in the tr's id attribute; the digit
    # groups embedded in that id are the octets of the machine's IP.
    computer_id = computer.get('id')
    if computer_id is None:
        return "none", None
    id_find = re.findall('[0-9]+', computer_id)
    print(id_find)
    return id_find, computer_id


for computer in computers:
    # Walk the cells of each row; the status icon's alt text tells us
    # whether the node is online or offline.
    for cell in computer.find_all('td'):
        png = cell.find('img')
        # print(png)
        if png is None:
            continue
        png = png.get('alt')
        print("the png is: " + str(png))
        if str(png) == "[offline]":
            print("this machine is offline")
            id_find, computer_id = get_ip(computer)
            if id_find == "none":
                continue
            with open(offline_file, 'a+') as ip_info:
                ips = ".".join(str(its) for its in id_find)
                ip_info.write(ips + " offline " + url + "/" + str(computer_id) + "\n")

        else:
            id_find, computer_id = get_ip(computer)
            if id_find == "none":
                continue
            with open(online_file, 'a+') as ip_info:
                ips = ".".join(str(its) for its in id_find)
                ip_info.write(ips + " online " + url + "/" + str(computer_id) + "\n")
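The core trick above is rebuilding an IP address from the digit groups embedded in a row's id attribute. Below is a minimal, self-contained sketch of that parsing step, run against a hypothetical fragment of the Jenkins node table (the markup and the node ids are made up for illustration; real Jenkins pages may differ):

# -*-coding:UTF-8-*-
import re
from bs4 import BeautifulSoup as bs

# Hypothetical fragment of the Jenkins node table, for illustration only.
html = '''
<table class="sortable pane bigtable">
  <tr id="node_10_200_1_15"><td><img alt="[offline]"/></td></tr>
  <tr id="node_10_200_1_16"><td><img alt="[online]"/></td></tr>
</table>
'''

soup = bs(html, 'html.parser')
for row in soup.find('table', class_='sortable pane bigtable').find_all('tr'):
    row_id = row.get('id')
    if row_id is None:
        continue
    octets = re.findall('[0-9]+', row_id)  # e.g. ['10', '200', '1', '15']
    print(".".join(octets), row.find('img').get('alt'))  # 10.200.1.15 [offline]

Invoked as, say, python get_jenkins_ips.py http://jenkins.example.com/computer myhosts (script name and url here are examples), the full script appends one line per node, such as "10.200.1.15 offline http://jenkins.example.com/computer/node_10_200_1_15", to the dated .info files.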

An upgraded version of this crawler, rewritten with Scrapy to scrape the IP info: https://blog.youkuaiyun.com/qq_46020608/article/details/113132339
