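# Set `dir` to the table-of-contents URL of a novel on shuhuangge (书荒阁) and `name` to the output file name; running the script then downloads the whole book.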
from bs4 import BeautifulSoup
import requests
# Browser-like User-Agent header passed to the requests.get calls below.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3729.169 Safari/537.36'
    ),
}
def download(url, path, timeout=30):
    """Fetch one chapter page and append its title and text to a file.

    The response is decoded as GBK, the text of the ``<div id="content">``
    element is extracted, and the page title followed by that text is
    appended to ``path`` as UTF-8.

    Args:
        url: Address of the chapter page.
        path: File the chapter is appended to (opened in append mode, so it
            is created when it does not exist yet).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On a connection failure, a timeout, or an
            HTTP error status.
        ValueError: If the page contains no ``<div id="content">`` element.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail on 4xx/5xx instead of saving an error page as if it were a chapter.
    res.raise_for_status()
    # The encoding is forced here rather than taken from the response headers.
    res.encoding = 'gbk'
    bs = BeautifulSoup(res.text, 'html.parser')

    content = bs.find('div', {'id': 'content'})
    if content is None:
        raise ValueError('no <div id="content"> element found at ' + url)
    # A page without <title> still gets its text saved, just with no heading.
    title = bs.title.get_text() if bs.title is not None else ''

    # Everything is extracted before the file is opened, so a malformed page
    # never leaves a title without its chapter text in the output.
    with open(path, 'a', encoding='utf-8') as f:
        f.write('\n\n' + title + '\n\n')
        f.write(content.get_text())
def getdir(url, baseurl, timeout=30):
    """Yield the chapter URLs listed on a novel's table-of-contents page.

    The entries are the ``<dd>``/``<dt>`` elements inside ``<div id="list">``.
    Everything up to and including the first ``<dt>`` found after the very
    first entry is skipped, and the links that follow are yielded.
    NOTE(review): presumably the skipped leading section is a "latest
    chapters" block that repeats entries of the full list -- confirm against
    the site. When no such ``<dt>`` exists the page has a single section and
    every entry is used.

    Args:
        url: Address of the table-of-contents page.
        baseurl: Prefix prepended verbatim to each chapter link's ``href``.
        timeout: Seconds to wait for the server before giving up.

    Yields:
        ``baseurl + href`` for each listed entry that carries a link.

    Raises:
        requests.RequestException: On a connection failure, a timeout, or an
            HTTP error status.
        ValueError: If the page contains no ``<div id="list">`` element.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    bs = BeautifulSoup(res.text, 'html.parser')

    listing = bs.find('div', {'id': 'list'})
    if listing is None:
        raise ValueError('no <div id="list"> element found at ' + url)
    tags = listing.find_all(['dd', 'dt'])

    # Position of the first <dt> at index >= 1; the search deliberately
    # starts after the first entry, exactly as the section skip requires.
    header = next(
        (i for i, tag in enumerate(tags[1:], start=1) if tag.name == 'dt'),
        None,
    )
    first = 0 if header is None else header + 1

    for tag in tags[first:]:
        link = tag.a
        href = link.get('href') if link is not None else None
        # Headers and other entries without a usable link are not chapters.
        if not href:
            continue
        yield baseurl + href
# Settings are hard-coded here; both `dir` and `name` could instead be read
# interactively with input().
baseurl = 'https://www.shuhuangge.org'      # prefix for the chapter links
dir = 'https://www.shuhuangge.org/0_71/'    # table-of-contents page
name = '我欲封天'                            # output file name, without ".txt"

# All chapters are appended, in listing order, to this single text file.
output_path = name + '.txt'

for chapter_url in getdir(dir, baseurl):
    download(chapter_url, output_path)
    print(chapter_url)
    # A pause (e.g. time.sleep(1)) could be added here to throttle requests.