First, the code:
import time
from random import random

import requests
from bs4 import BeautifulSoup

# Listing pages are paginated by offset, e.g. http://steamworkshop.download/latest/rowstart/0/

def main():
    count = 1
    keywords = ["Genshin"]  # replace with the keywords you want to filter by, e.g. r18, xray... okay, I'll stop there
    key = "Download: "
    for i in range(40):
        url = "http://steamworkshop.download/latest/rowstart/" + str(50 * i) + "/"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        # Save the data with open(); the txt path is given below.
        # Mode 'a' appends to the file instead of overwriting it.
        with open('color.txt', 'a', encoding='utf-8') as f:
            for link in soup.find_all("a"):
                if key in link.text:
                    for keyword in keywords:
                        if keyword in link.text:
                            print(link.get("href"), link.text)
                            # Write the href and the title with the "Download: " prefix removed;
                            # the trailing \n keeps one entry per line for readability.
                            f.write(link.get("href") + "\t" + link.text[len(key):] + '\n')
                            break
        print("Pause #" + str(count))
        count = count + 1
        time.sleep(random() * 2)  # random short nap between pages to be polite to the server

if __name__ == '__main__':
    s = time.time()
    main()
    e = time.time()
    print('Total time:', e - s)
The idea is simple: scrape every hyperlink on each listing page, then filter the links by keyword and save the matches to a file.
The code can be run as-is.
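To see the filtering step in isolation, here is a minimal sketch that runs against a hardcoded HTML fragment instead of the live site; the snippet, URLs, and link titles are made up for illustration, but the parsing logic is the same as above:

from bs4 import BeautifulSoup

# A made-up fragment mimicking the listing page's link format.
html = """
<a href="http://example.com/file/111">Download: Genshin Wallpaper</a>
<a href="http://example.com/file/222">Download: Space Scene</a>
<a href="http://example.com/about">About</a>
"""

key = "Download: "
keywords = ["Genshin"]

soup = BeautifulSoup(html, "html.parser")
for link in soup.find_all("a"):
    text = link.text
    if key in text and any(kw in text for kw in keywords):
        # keep the href plus the title with the "Download: " prefix removed
        print(link.get("href"), text[len(key):])

Running this prints only the matching entry, http://example.com/file/111 with the title "Genshin Wallpaper". Note that slicing with text[len(key):] removes exactly the prefix; str.lstrip(key) would not work here, since lstrip treats its argument as a set of characters to strip rather than a literal prefix.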