Python实现网站TDK扫描 - CSDN博客

本文链接：https://blog.csdn.net/Gurs_HuaNian/article/details/145308351

import re
import urllib3
import os
import yagmail
import requests
import logging
from concurrent.futures import ThreadPoolExecutor
import time
import threading
import xlwt
import xlrd
import socket
from xlutils.copy import copy
from urllib3.exceptions import InsecureRequestWarning
 
# Silence urllib3's InsecureRequestWarning (pages are fetched with verify=False)
urllib3.disable_warnings(InsecureRequestWarning)
 
# HTTP headers sent with every page request: a desktop Chrome User-Agent string
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.6613.85 Safari/537.36',
}
 
def get_ip(url):
    """Resolve the host named in *url* to an IP address string.

    Only the hostname is resolved: scheme, credentials, port, path, query
    and fragment are ignored, so ``http://example.com:8080/a?b=1`` and the
    bare ``example.com`` both resolve ``example.com``.

    Args:
        url: A URL or bare host name; surrounding whitespace is ignored.

    Returns:
        The first address reported by ``socket.getaddrinfo`` for the host.

    Raises:
        socket.gaierror: If the URL is malformed, contains no host, or the
            host cannot be resolved (a subclass of ``socket.error``).
    """
    # Function-scope import: the file's import block does not provide it.
    from urllib.parse import urlsplit

    target = url.strip()
    # urlsplit only recognises a network location after "//", so a bare
    # "example.com:8080/path" needs a scheme-less prefix to be parsed.
    if '://' not in target:
        target = '//' + target

    try:
        host = urlsplit(target).hostname
    except ValueError as error:
        # Keep the failure inside the exception family callers already catch.
        raise socket.gaierror(socket.EAI_NONAME, f'invalid URL: {url!r}') from error
    if not host:
        # getaddrinfo(None, ...) would silently resolve to the local machine.
        raise socket.gaierror(socket.EAI_NONAME, f'no host in URL: {url!r}')

    # Numeric port 80 is the "http" service without needing /etc/services.
    addresses = socket.getaddrinfo(host, 80)
    return addresses[0][4][0]
 
# Text between <title> and the next tag.
_TITLE_RE = re.compile(r"(?<=\<title\>)(?:.|\n)+?(?=\<)", re.IGNORECASE)
# The content attribute of the description / keywords meta tags. The closing
# look-ahead accepts `">`, `" >`, `"/>` and `" />`.
_DESCRIPTION_RE = re.compile(
    r"(?<=\<meta name=\"description\" content=\")(?:.|\n)+?(?=\"\s*\/?\>)",
    re.IGNORECASE,
)
_KEYWORDS_RE = re.compile(
    r"(?<=\<meta name=\"keywords\" content=\")(?:.|\n)+?(?=\"\s*\/?\>)",
    re.IGNORECASE,
)


def _first_match(pattern, text, default=" "):
    """Return the stripped first match of *pattern* in *text*, else *default*."""
    found = pattern.search(text)
    return found.group(0).strip() if found else default


def get_codetitle(url):
    """Fetch *url* and extract its status code, TDK fields and IP address.

    Each of title, description and keywords is extracted independently; a
    page that lacks one of them keeps the " " placeholder for that field
    instead of aborting the whole lookup.

    Args:
        url: Absolute URL to request.

    Returns:
        A tuple ``(resurl, code, title, description, keywords, ip)`` where
        ``resurl`` is the final URL after redirects, ``code`` is the HTTP
        status code (or "无法访问" when the request failed) and ``ip`` is the
        resolved address (or 'null' when resolution failed).
    """
    code = "无法访问"
    title = " "
    resurl = " "
    description = " "
    keywords = " "
    try:
        res = requests.get(url, headers=header, verify=False, allow_redirects=True, timeout=(3, 12))
        res.encoding = res.apparent_encoding
        code = res.status_code
        # Record the final URL before parsing so it is never lost.
        resurl = res.url
        page = res.text
        title = _first_match(_TITLE_RE, page)
        description = _first_match(_DESCRIPTION_RE, page)
        keywords = _first_match(_KEYWORDS_RE, page)
    except requests.RequestException as error:
        logging.error('%s网址无效或者IP被封锁: %s', url, error)

    try:
        ip = get_ip(url)
    except socket.error as error:
        logging.error('获取IP失败: %s', error)
        ip = 'null'

    return resurl, code, title, description, keywords, ip
 
def write(url):
    """Scan *url* and append its result as a new row of the result workbook.

    The workbook is the ``.xls`` file named after the module-level
    ``savefilename`` inside ``path``. It is re-read, copied, extended by one
    row and saved again while holding the module-level ``lock``.

    Args:
        url: The source URL to scan; it is also stored in the first column.
    """
    fields = [str(item) for item in get_codetitle(url)]
    _, code, title, _, _, ip = fields
    logging.info('%s | %s | %s | %s', url, code, title, ip)

    xls_path = os.path.join(path, savefilename + '.xls')
    with lock:
        with xlrd.open_workbook(xls_path) as existing_book:
            first_sheet_name = existing_book.sheet_names()[0]
            # The current row count is the index of the first free row.
            next_row = existing_book.sheet_by_name(first_sheet_name).nrows
            writable_book = copy(existing_book)
            target_sheet = writable_book.get_sheet(0)
            # Column order: url, resurl, code, title, description, keywords, ip
            for column, value in enumerate([url] + fields):
                target_sheet.write(next_row, column, value)
            writable_book.save(xls_path)
 
def process_urls(input_file, output_file):
    """Normalise the URL list in *input_file* and write it to *output_file*.

    Each non-blank line is stripped of surrounding whitespace and written on
    its own line. Lines that do not already start with ``http://`` or
    ``https://`` (compared case-insensitively) get an ``http://`` prefix.
    Blank lines are dropped so they do not turn into a bare ``http://`` entry.

    Args:
        input_file: Path of the text file holding one address per line.
        output_file: Path of the file to create or overwrite. It is written
            as UTF-8, matching how the script reads it back.
    """
    with open(input_file, "r") as f:
        candidates = [line.strip() for line in f]

    with open(output_file, "w", encoding="utf-8") as f2:
        for candidate in candidates:
            if not candidate:
                continue
            if candidate.lower().startswith(('http://', 'https://')):
                f2.write(candidate + '\n')
            else:
                f2.write('http://' + candidate + '\n')
 
def send_email(duration):
    """Send a completion notice containing the scan duration by e-mail.

    Credentials are read from the ``EMAIL_USER`` and ``EMAIL_PASS``
    environment variables and the message goes out through ``smtp.qq.com``
    on port 465. Sending is best-effort: any failure is logged and
    swallowed so it cannot abort the scan.

    Args:
        duration: Elapsed scan time in seconds, embedded in the message body.
    """
    try:
        yag = yagmail.SMTP(user=os.getenv("EMAIL_USER"), password=os.getenv("EMAIL_PASS"), host='smtp.qq.com', port=465)
        try:
            contents = [f'TDK获取时间：{duration}秒']
            subject = 'TDK获取完成通知'
            receiver = ["705276383@qq.com"]
            yag.send(to=receiver, subject=subject, contents=contents)
        finally:
            # Close the SMTP connection even when sending fails.
            yag.close()
    except Exception as error:
        logging.error('发送邮件失败: %s', error)
 
if __name__ == "__main__":
    # Script entry point: create the result workbook, normalise url.txt,
    # scan every URL on a thread pool, then report the elapsed time.
    # `path`, `lock` and `savefilename` are module globals read by write().
    path = os.getcwd()
    logging.captureWarnings(True)
    logging.basicConfig(level=logging.INFO)
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    start = time.time()
    lock = threading.Lock()
    savefilename = time.strftime("%Y-%m-%d %H.%M.%S")
    result_file = os.path.join(path, savefilename + '.xls')

    # Create the workbook with its header row; write() appends below it.
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'title', cell_overwrite_ok=True)
    captions = ("源地址", "跳转地址", "状态码", "标题", "描述", "关键词", "IP")
    for column, caption in enumerate(captions):
        sheet1.write(0, column, caption)
    myxls.save(result_file)

    process_urls(os.path.join(path, "url.txt"), os.path.join(path, "url-run.txt"))

    with open(os.path.join(path, 'url-run.txt'), 'r', encoding='utf-8') as f:
        urls_data = [data.strip().strip('\\') for data in f]

    # Keep the futures: an exception raised inside a submitted task is
    # otherwise discarded and the URL disappears from the report unnoticed.
    with ThreadPoolExecutor(max_workers=100) as executor:
        futures = {executor.submit(write, url=url): url for url in urls_data}

    for future, url in futures.items():
        failure = future.exception()
        if failure is not None:
            logging.error('处理 %s 时出错: %s', url, failure)

    end = time.time()
    logging.info("总耗时: %s 秒", end - start)

    send_email(end - start)

以上 Python 代码基于 Python 3.13 编写，如后续升级到其他版本，可自行调整。
注意事项：运行前需要在当前工作目录（代码通过 os.getcwd() 获取，通常即代码所在目录）下新建一个 url.txt 文档，将需要扫描的地址填入，每个地址单独一行。扫描完成后会在同一目录下生成一个 Excel 表格文档（.xls），以当前日期和时间命名。