今天写了一个爬取补天厂商列表的爬虫,方便进行渗透测试。
直接贴出代码:
import requests
from lxml import etree
import os
# Browse the page as a DOM tree via lxml; install the modules with `pip install requests lxml` before importing.
def Save_File(messageList):
    """Append vendor (name, url) pairs to 补天厂商列表/厂商列表.txt.

    Parameters
    ----------
    messageList : list of (title, url) tuples, where title and url are
        single-element lists as produced by ``load_message``.
    """
    path = '补天厂商列表'
    # makedirs(exist_ok=True) is race-free, unlike exists() + mkdir().
    os.makedirs(path, exist_ok=True)
    new_path = os.path.join(path, '厂商列表.txt')
    # Append mode so successive pages accumulate in one file.
    with open(new_path, 'a+', encoding='utf8') as f:
        for title, url in messageList:
            # Write the vendor name and its URL on one line.
            f.write('%s %s\n' % (title[0], url[0]))
def load_message(page_message):
    """Parse one vendor listing page into (name, url) pairs.

    page_message: decoded HTML text of a listing page.
    Returns a list of (title, url) tuples; each element is the raw
    (single-element) list returned by the XPath query.  Rows start at
    tr[2] because tr[1] is the table header.
    """
    dom = etree.HTML(page_message)
    collected = []
    row = 2
    while True:
        # XPath (not regex): vendor name is the link text in column 1.
        names = dom.xpath('//table/tr[{0}]/td[1]/a/text()'.format(row))
        if not names:
            # No name cell: we have walked past the last data row.
            break
        # Column 2 holds the vendor URL as plain text; some rows lack it.
        links = dom.xpath('//table/tr[{0}]/td[2]/text()'.format(row)) or ['URL丢失']
        collected.append((names, links))
        row += 1
    return collected
def Spider(file_URL, last_message):
    """Fetch one listing page, save any new vendors, and report progress.

    Parameters
    ----------
    file_URL : str
        URL of the listing page to fetch.
    last_message : tuple
        ``(previous_first_title, previous_flag)`` — the value returned by
        the previous call (or a sentinel such as ('a', 'b') initially).

    Returns
    -------
    tuple
        ``(first_title, repeated)``: the first row's title list, and True
        when it equals the previous page's first title — i.e. pagination
        ran past the end and the site served the same page again.
    """
    print(file_URL)
    # The site serves UTF-8; decode explicitly instead of trusting the
    # charset guessed by requests.
    page = requests.get(file_URL).content.decode('utf8')
    message_list = load_message(page)
    if not message_list:
        # Empty page: treat it as the end of pagination.
        return last_message[0], True
    repeated = message_list[0][0] == last_message[0]
    # BUG FIX: the original compared against the whole last_message tuple
    # (always unequal), so the duplicated final page was saved again.
    if not repeated:
        Save_File(message_list)
    return message_list[0][0], repeated
if __name__ == '__main__':
    list_url = 'https://butian.360.cn/company/lists'
    # First page has no /page suffix; the ('a', 'b') sentinel can never
    # match a real title list, so the first page is always saved.
    previous = Spider(list_url, ('a', 'b'))
    page = 2
    while True:
        # Walk the numbered pages until the site starts repeating itself.
        previous = Spider('{0}/page/{1}'.format(list_url, page), previous)
        if previous[1]:
            break
        page += 1
    print('finish')
到此,补天厂商的列表就都被抓取出来了。