使用 Python 实现网站 TDK(标题、描述、关键词)扫描

 

import re
import urllib3
import os
import yagmail
import requests
import logging
from concurrent.futures import ThreadPoolExecutor
import time
import threading
import xlwt
import xlrd
import socket
from xlutils.copy import copy
from urllib3.exceptions import InsecureRequestWarning
 
# Silence urllib3's InsecureRequestWarning; get_codetitle() issues requests with verify=False.
urllib3.disable_warnings(InsecureRequestWarning)
 
# HTTP headers sent with every page request (a desktop Chrome User-Agent string).
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.6613.85 Safari/537.36',
}
 
def get_ip(url):
    """Resolve the host named in *url* to an IP address string.

    Only the host component is looked up, so URLs carrying a port, path,
    query string or credentials (``http://user@host:8080/a?b=1``) resolve
    correctly. A bare host without a scheme (``example.com``) is accepted too.

    Args:
        url: A URL or bare host name; surrounding whitespace is ignored.

    Returns:
        The address from the first ``socket.getaddrinfo`` result.

    Raises:
        socket.gaierror: If no host can be extracted from *url* or the
            lookup fails (a subclass of ``socket.error`` / ``OSError``).
    """
    from urllib.parse import urlsplit  # local import keeps this block self-contained

    target = url.strip()
    # urlsplit only recognises a network location after "//"; add it for bare hosts.
    if '://' not in target:
        target = '//' + target
    try:
        host = urlsplit(target).hostname
    except ValueError:
        # e.g. malformed IPv6 brackets; report it the way a failed lookup would be.
        host = None
    if not host:
        raise socket.gaierror(socket.EAI_NONAME, 'no host name in URL: %r' % (url,))
    # A numeric port avoids depending on the system services database for "http".
    myaddr = socket.getaddrinfo(host, 80)
    return myaddr[0][4][0]
 
def _first_match(pattern, text, default):
    """Return the stripped ``value`` group of the first match of *pattern*, else *default*."""
    found = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
    if found is None:
        return default
    return found.group('value').strip()


def get_codetitle(url):
    """Fetch *url* and collect its status code, TDK fields and IP address.

    Each of title, description and keywords is extracted independently; a
    page that lacks one of them keeps the placeholder for that field instead
    of aborting the whole scan of the URL.

    Args:
        url: Absolute URL to request.

    Returns:
        A tuple ``(resurl, code, title, description, keywords, ip)``.
        ``resurl`` is the final URL after redirects. When the request fails,
        ``code`` is "无法访问" and the text fields are a single space. ``ip``
        is ``'null'`` when the host cannot be resolved.
    """
    code = "无法访问"
    title = " "
    resurl = " "
    description = " "
    keywords = " "
    try:
        res = requests.get(url, headers=header, verify=False, allow_redirects=True, timeout=(3, 12))
        # Let requests guess the charset from the body; declared charsets are often wrong.
        res.encoding = res.apparent_encoding
        code = res.status_code
        resurl = res.url
        html = res.text
    except requests.RequestException as error:
        logging.error('%s网址无效或者IP被封锁: %s', url, error)
    else:
        title = _first_match(r"<title[^>]*>(?P<value>.*?)<", html, title)
        # Accept either quote style and both ">" and "/>" tag endings.
        description = _first_match(
            r"<meta\s+name=[\"']description[\"']\s+content=(?P<q>[\"'])(?P<value>.*?)(?P=q)\s*/?>",
            html,
            description,
        )
        keywords = _first_match(
            r"<meta\s+name=[\"']keywords[\"']\s+content=(?P<q>[\"'])(?P<value>.*?)(?P=q)\s*/?>",
            html,
            keywords,
        )

    try:
        ip = get_ip(url)
    except (socket.error, UnicodeError) as error:
        # UnicodeError covers host names that cannot be IDNA-encoded.
        logging.error('获取IP失败: %s', error)
        ip = 'null'

    return resurl, code, title, description, keywords, ip
 
def write(url):
    """Scan *url* and append the result as one new row of the .xls report.

    Reads the module-level ``lock``, ``path`` and ``savefilename`` globals.
    The whole read-copy-append-save cycle runs while holding ``lock`` so that
    concurrent workers do not overwrite each other's rows.

    Args:
        url: The source URL to scan; written unchanged into column 0.
    """
    resurl, code, title, description, keywords, ip = (
        str(item) for item in get_codetitle(url)
    )
    logging.info('%s | %s | %s | %s', url, code, title, ip)
    row_values = (url, resurl, code, title, description, keywords, ip)
    with lock:
        with xlrd.open_workbook(os.path.join(path, savefilename + '.xls')) as source_book:
            first_sheet_name = source_book.sheet_names()[0]
            # The next free row index equals the number of rows already stored.
            next_row = source_book.sheet_by_name(first_sheet_name).nrows
            # xlrd workbooks are read-only; xlutils.copy yields a writable clone.
            writable_book = copy(source_book)
            target_sheet = writable_book.get_sheet(0)
            for column, value in enumerate(row_values):
                target_sheet.write(next_row, column, value)
            writable_book.save(os.path.join(path, savefilename + '.xls'))
 
def process_urls(input_file, output_file):
    """Normalise a list of URLs so that every entry carries a scheme.

    Reads *input_file* (one URL per line), strips surrounding whitespace,
    skips blank lines and writes the result to *output_file*. Entries that do
    not already start with ``http://`` or ``https://`` (case-insensitive) get
    ``http://`` prepended.

    Args:
        input_file: Path of the text file to read; decoded as UTF-8, a
            leading byte-order mark is tolerated.
        output_file: Path of the text file to write; encoded as UTF-8.
    """
    schemes = ('http://', 'https://')

    # Read everything before opening the output, in case both paths are the same file.
    with open(input_file, "r", encoding="utf-8-sig") as src:
        entries = [raw.strip() for raw in src]

    with open(output_file, "w", encoding="utf-8") as dst:
        for entry in entries:
            if not entry:
                # A blank line would otherwise become the bogus URL "http://".
                continue
            if not entry.lower().startswith(schemes):
                entry = 'http://' + entry
            dst.write(entry + '\n')
 
def send_email(duration):
    """Send a completion notice containing the elapsed scan time.

    Credentials are read from the ``EMAIL_USER`` and ``EMAIL_PASS``
    environment variables. Sending is best-effort: any failure is logged and
    swallowed so that it cannot abort the program.

    Args:
        duration: Elapsed time in seconds, interpolated into the mail body.
    """
    try:
        # The context manager closes the SMTP connection even if send() raises.
        with yagmail.SMTP(user=os.getenv("EMAIL_USER"), password=os.getenv("EMAIL_PASS"), host='smtp.qq.com', port=465) as yag:
            contents = [f'TDK获取时间:{duration}秒']
            subject = 'TDK获取完成通知'
            receiver = ["705276383@qq.com"]
            yag.send(to=receiver, subject=subject, contents=contents)
    except Exception as error:
        logging.error('发送邮件失败: %s', error)
 
if __name__ == "__main__":
    # Script entry point: create the report workbook, normalise url.txt, scan
    # every URL in a thread pool, then log and e-mail the total duration.
    # NOTE: path, lock and savefilename are module-level globals read by write().
    path = os.getcwd()
    logging.captureWarnings(True)
    logging.basicConfig(level=logging.INFO)
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    start = time.time()
    lock = threading.Lock()
    savefilename = time.strftime("%Y-%m-%d %H.%M.%S")

    # Create the report with its header row; write() appends one row per URL.
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'title', cell_overwrite_ok=True)
    captions = ("源地址", "跳转地址", "状态码", "标题", "描述", "关键词", "IP")
    for column, caption in enumerate(captions):
        sheet1.write(0, column, caption)
    myxls.save(os.path.join(path, savefilename + '.xls'))

    process_urls(os.path.join(path, "url.txt"), os.path.join(path, "url-run.txt"))

    with open(os.path.join(path, 'url-run.txt'), 'r', encoding='utf-8') as f:
        cleaned = (data.strip().strip('\\') for data in f)
        # Drop lines that are empty after cleaning; there is nothing to scan.
        urls_data = [data for data in cleaned if data]

    # Leaving the with-block waits for every submitted task to finish.
    with ThreadPoolExecutor(max_workers=100) as executor:
        futures = {executor.submit(write, url=url): url for url in urls_data}

    # An exception raised inside a worker is stored on its future; report it
    # here instead of letting it disappear silently.
    for future, url in futures.items():
        error = future.exception()
        if error is not None:
            logging.error('%s 处理失败: %s', url, error)

    end = time.time()
    logging.info("总耗时: %s 秒", end - start)

    send_email(end - start)

以上 Python 代码基于 Python 3.13 编写,后续如有版本升级,可自行调整。
注意事项:需要在代码同级目录下新建一个 url.txt 文件,将需要扫描的地址填入,每个地址单独占一行。扫描完成后会在当前工作目录(通常即代码所在目录)下生成一个以当前日期和时间命名的 Excel 文件(.xls)。如需邮件通知,请先设置环境变量 EMAIL_USER 和 EMAIL_PASS。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值