BeautifulSoup, requests, and re module case study: scraping a Jenkins page for the IPs of online and offline machines and saving them to files

# -*-coding:UTF-8-*-

import os, sys
import datetime
import requests
import re
from bs4 import BeautifulSoup as bs

if len(sys.argv) < 3:
    print("Not enough arguments! Please pass in a url and a filename")
    sys.exit(1)
url = sys.argv[1]
filename = sys.argv[2]

#### Handle the date ####
currentdate = datetime.date.today()
year = currentdate.year
month = currentdate.month
day = currentdate.day
print("local time is: " + str(year) + "-" + str(month) + "-" + str(day))

#### Fetch the page ####
print("url is " + url)
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
    'Cookie': 'Cookie'}  # placeholder: fill in your own Jenkins session cookie

url_content = requests.get(url, headers=header)
url_content = url_content.content

content_set = bs(url_content, 'html.parser')
# print(content_set)

# The machine list lives in the sortable node table on the Jenkins page.
computers = content_set.find('table', class_='sortable pane bigtable')
if computers is None:
    print("Could not find the machine table on the page")
    sys.exit(1)

computers = computers.find_all('tr')
# print(computers)


#### Remove existing output files for today, if any ####
print("local dir is: " + os.getcwd())
online_file = filename + "_" + str(month) + "_" + str(day) + ".info"
offline_file = filename + "_offline_" + str(month) + "_" + str(day) + ".info"
if os.path.exists(online_file):
    os.remove(online_file)
if os.path.exists(offline_file):
    os.remove(offline_file)


def get_ip(computer):
    # A node row carries its name in the tr's id attribute; the digit
    # groups embedded in that id are the octets of the machine's IP.
    computer_id = computer.get('id')
    if computer_id is None:
        return "none", None
    id_find = re.findall('[0-9]+', computer_id)
    print(id_find)
    return id_find, computer_id


for computer in computers:
    # Walk the cells of each row; the status icon's alt text tells us
    # whether the node is online or offline.
    for cell in computer.find_all('td'):
        png = cell.find('img')
        # print(png)
        if png is None:
            continue
        png = png.get('alt')
        print("the png is: " + str(png))
        if str(png) == "[offline]":
            print("this machine is offline")
            id_find, computer_id = get_ip(computer)
            if id_find == "none":
                continue
            with open(offline_file, 'a+') as ip_info:
                ips = ".".join(str(its) for its in id_find)
                ip_info.write(ips + " offline " + url + "/" + str(computer_id) + "\n")

        else:
            id_find, computer_id = get_ip(computer)
            if id_find == "none":
                continue
            with open(online_file, 'a+') as ip_info:
                ips = ".".join(str(its) for its in id_find)
                ip_info.write(ips + " online " + url + "/" + str(computer_id) + "\n")
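The core trick above is rebuilding an IP address from the digit groups embedded in a row's id attribute. Below is a minimal, self-contained sketch of that parsing step, run against a hypothetical fragment of the Jenkins node table (the markup and the node ids are made up for illustration; real Jenkins pages may differ):

# -*-coding:UTF-8-*-
import re
from bs4 import BeautifulSoup as bs

# Hypothetical fragment of the Jenkins node table, for illustration only.
html = '''
<table class="sortable pane bigtable">
  <tr id="node_10_200_1_15"><td><img alt="[offline]"/></td></tr>
  <tr id="node_10_200_1_16"><td><img alt="[online]"/></td></tr>
</table>
'''

soup = bs(html, 'html.parser')
for row in soup.find('table', class_='sortable pane bigtable').find_all('tr'):
    row_id = row.get('id')
    if row_id is None:
        continue
    octets = re.findall('[0-9]+', row_id)  # e.g. ['10', '200', '1', '15']
    print(".".join(octets), row.find('img').get('alt'))  # 10.200.1.15 [offline]

Invoked as, say, python get_jenkins_ips.py http://jenkins.example.com/computer myhosts (script name and url here are examples), the full script appends one line per node, such as "10.200.1.15 offline http://jenkins.example.com/computer/node_10_200_1_15", to the dated .info files.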

An upgraded version of this crawler, rewritten with Scrapy to scrape the IP info: https://blog.youkuaiyun.com/qq_46020608/article/details/113132339
