I. Introduction
When hunting on SRC (Security Response Center) programs, information gathering is an unavoidable step: you have to check each domain's weight to decide whether it meets a platform's submission rules. Copying domains into a lookup site one at a time is tedious, so this post uses a Python crawler to build a script that supports both single and batch queries, which greatly cuts down the manual workload and improves efficiency.
II. Code Implementation
1. Install the required libraries
pip install requests
pip install lxml
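Here, requests issues the HTTP requests and lxml parses the returned HTML via XPath. If you prefer, the two installs can also be combined into a single command:

pip install requests lxml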
2. Implementation
import requests
import urllib3
import argparse
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
# Suppress the InsecureRequestWarning that requests emits when TLS certificate verification is disabled
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def query(url):
    url = "https://www.aizhan.com/cha/{}/".format(url)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8,vi;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': 'https://www.aizhan.com/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
        'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'linux',
    }
    # verify=False matches the warning suppressed above; the timeout keeps a slow
    # response from hanging a worker thread indefinitely
    response = requests.get(url=url, headers=headers, verify=False, timeout=10)
    lxml_tree = etree.HTML(response.text)
    # The page title aizhan reports for the queried domain
    href_name = lxml_tree.xpath('//div[@id="webpage_title"]//text()')
    print("\n-> Title: {0}".format("".join(href_name)))
    # Each weight badge is an <img> whose alt attribute holds the numeric value
    br = lxml_tree.xpath('//a[@id="baidurank_br"]//img//@alt')
    mbr = lxml_tree.xpath('//a[@id="baidurank_mbr"]//img//@alt')
    pr = lxml_tree.xpath('//a[@id="360_pr"]//img//@alt')
    sm_pr = lxml_tree.xpath('//a[@id="sm_pr"]//img//@alt')
    sogou_pr = lxml_tree.xpath('//a[@id="sogou_pr"]//img//@alt')
    google_pr = lxml_tree.xpath('//a[@id="google_pr"]//img//@alt')
    print("[+] Weights: \n Baidu PC: {0}\tBaidu mobile: {1}\t360: {2}\tShenma: {3}\tSogou: {4}\tGoogle PR: {5}".format(
        "".join(br), "".join(mbr), "".join(pr), "".join(sm_pr), "".join(sogou_pr), "".join(google_pr)))
    # ICP filing information; drop the layout whitespace between text nodes
    icp = lxml_tree.xpath('//ul[@id="icp"]//text()')
    print("[+] ICP filing: \n", " ".join(t.strip() for t in icp if t.strip()))
if __name__ == "__main__":
    example_text = """
    python 1.py -u qq.com
    python 1.py -f 1.txt
    """
    parser = argparse.ArgumentParser(
        description=example_text, formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument("-u", "--url", required=False, help="query a single domain")
    parser.add_argument("-f", "--files", required=False, help="file with one domain per line")
    args = parser.parse_args()
    url = args.url
    files = args.files
    if url:
        query(url=url)
    elif files:
        count = 0
        futures = []
        with open(files, "r", encoding="utf-8") as f:
            # Thread pool capped at 10 concurrent queries
            with ThreadPoolExecutor(10) as threadPool:
                for line in f:
                    domain = line.strip()
                    if not domain:
                        continue
                    futures.append(threadPool.submit(query, domain))
                    count += 1
        # An exception raised inside a worker is stored on its Future;
        # surface it here instead of silently dropping it
        for future in futures:
            try:
                future.result()
            except Exception as e:
                print("[-] error: ", e)
        print("\ntotal: [{}]".format(count))
    else:
        parser.print_help()
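For batch mode, the script expects a plain-text file with one domain per line. A hypothetical 1.txt, matching the usage example in the script, could look like this:

qq.com
example.com
example.org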
III. Results
1. Single query
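Going by the print statements in the script, a single query such as python 1.py -u qq.com prints output of roughly the following shape (the values below are placeholders, not real results):

-> Title: <page title reported by aizhan>
[+] Weights:
 Baidu PC: <n>  Baidu mobile: <n>  360: <n>  Shenma: <n>  Sogou: <n>  Google PR: <n>
[+] ICP filing:
 <organization and ICP record number>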
2. Batch query
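In batch mode each domain is handled by a separate worker thread, so lines from different domains may interleave on the console. Redirecting stdout to a file is a simple way to keep the results together for later filtering:

python 1.py -f 1.txt > results.txt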
IV. Summary
The idea behind the code is simple, and the tool works well in practice. Whenever a task turns into repetitive manual work, it is worth asking whether what you have learned can be applied to simplify it; you can also borrow ideas from other people's tools when building your own.