Scraping a Novel

from bs4 import BeautifulSoup
import requests
import sys


class downloader(object):  # a simple novel-downloader class
    def __init__(self):
        self.server = 'http://www.biqukan.com/'            # site root, prepended to relative chapter links
        self.target = 'http://www.biqukan.com/39_39591/'   # catalogue page of the novel
        self.names = []   # chapter titles
        self.urls = []    # chapter links
        self.nums = 0     # number of chapters

    def get_download_url(self):  # collect the link of every chapter
        req = requests.get(url=self.target)
        html = req.text  # why .text?? (answered below)
        div_df = BeautifulSoup(html, 'html.parser')
        div = div_df.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), 'html.parser')  # why str()?? (answered below)
        a = a_bf.find_all('a')
        self.nums = len(a[12:])  # skip the leading links (the "latest chapters" block at the top of the catalogue)
        for each in a[12:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):  # fetch the text of one chapter
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', class_='showtxt')
        texts = texts[0].text.replace('\xa0' * 8, '\n\n')  # turn the 8-NBSP indents into blank lines
        return texts

    def writer(self, name, path, text):  # append one chapter to the output file
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')   # chapter title
            f.writelines(text)     # chapter body
            f.write('\n\n')


if __name__ == '__main__':
    d1 = downloader()        # create the downloader
    d1.get_download_url()    # build the chapter list
    print('《终鄢》download started:')
    for i in range(d1.nums):
        d1.writer(d1.names[i], '终鄢.txt', d1.get_contents(d1.urls[i]))
        sys.stdout.write('  downloaded: %.3f%%' % ((i + 1) / d1.nums * 100) + '\r')
        sys.stdout.flush()
    print('《终鄢》download finished')
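Two questions appear in the comments above. First, requests.get() returns a Response object, not a string; its .text attribute is the response body decoded to str, which is what BeautifulSoup expects. Second, re-parsing str(div[0]) with a second BeautifulSoup object is not actually necessary: div[0] is already a Tag, and a Tag supports find_all() directly. A minimal sketch against the same catalogue page used above:

import requests
from bs4 import BeautifulSoup

resp = requests.get('http://www.biqukan.com/39_39591/')
html = resp.text                       # .text decodes the raw bytes (resp.content) into a str

soup = BeautifulSoup(html, 'html.parser')
listmain = soup.find_all('div', class_='listmain')[0]

# a Tag can be searched directly; BeautifulSoup(str(listmain)) is redundant
links = listmain.find_all('a')
print(len(links))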
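One further point worth noting: as written, each requests.get() call will wait indefinitely if the site stops responding, and the loop hits the server as fast as it can. Below is a hedged sketch of a slightly more defensive fetch helper; the timeout, retry count, and delay values are arbitrary illustrative choices, not part of the original post.

import time
import requests

def fetch(url, retries=3, timeout=10, delay=1.0):
    """Fetch a page with a timeout, a few retries, and a polite pause."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()    # raise on 4xx/5xx responses
            time.sleep(delay)          # be polite to the server
            return resp.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

get_contents() could then call fetch(target) instead of requests.get(url=target).text.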