import requests
from bs4 import BeautifulSoup
import xlsxwriter
import os
def getHtml(url, timeout=10):
    """Download a book listing page and return its per-book ``<li>`` tags.

    The page is requested with a browser-like User-Agent header and parsed
    with the ``lxml`` parser.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The ``<li>`` tags found inside ``<ul class="subject-list">``, or an
        empty list when the page contains no such ``<ul>``.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36"}
    data = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    data.raise_for_status()
    soup = BeautifulSoup(data.text, 'lxml')
    dou_books = soup.find('ul', {'class': 'subject-list'})
    if dou_books is None:
        # find() returns None when the list is absent; report "no books"
        # rather than raising AttributeError on the next call.
        return []
    return dou_books.find_all('li')
def getTitles(books):
    """Return the cleaned title text of every book entry.

    For each entry the text of its second ``<a>`` tag is taken and every
    newline and space character is removed from it.

    Args:
        books: Iterable of parsed book entries exposing ``find_all``.

    Returns:
        A list of title strings, one per entry, in input order.
    """
    return [
        entry.find_all('a')[1].get_text().replace('\n', '').replace(' ', '')
        for entry in books
    ]
def getImg_urls(books):
    """Return the cover image address of every book entry.

    For each entry the ``src`` attribute of the ``<img>`` inside its first
    ``<a>`` tag is read.

    Args:
        books: Iterable of parsed book entries exposing ``find_all``.

    Returns:
        A list with one ``src`` value per entry, in input order.
    """
    return [
        entry.find_all('a')[0].find('img').get('src')
        for entry in books
    ]
def getAuthors(books):
    """Return the cleaned ``div.pub`` text of every book entry.

    For each entry the text of its ``<div class="pub">`` is taken and every
    newline and space character is removed from it.

    Args:
        books: Iterable of parsed book entries exposing ``find``.

    Returns:
        A list of strings, one per entry, in input order.
    """
    return [
        entry.find('div', {'class': 'pub'}).get_text().replace('\n', '').replace(' ', '')
        for entry in books
    ]
def getRatings(books):
    """Return the cleaned rating text of every book entry.

    For each entry the text of ``span.rating_nums`` inside its
    ``<div class="star clearfix">`` is taken and every newline and space
    character is removed from it.

    Args:
        books: Iterable of parsed book entries exposing ``find``.

    Returns:
        A list of rating strings, one per entry, in input order. An entry
        with no star container or no rating span yields ``''`` so the result
        stays the same length as ``books``.
    """
    ratings = []
    for book in books:
        rat = book.find('div', {'class': 'star clearfix'})
        # find() returns None when the element is missing; guard both
        # lookups instead of letting one unrated book abort the whole run.
        span = rat.find('span', {'class': 'rating_nums'}) if rat is not None else None
        if span is None:
            ratings.append('')
            continue
        ratings.append(span.get_text().replace('\n', '').replace(' ', ''))
    return ratings
if __name__ == '__main__':
    # Crawl the requested number of listing pages for the 中国文学 tag and
    # save image URL, title, div.pub text and rating of every book into
    # 中国文学/中国文学.xlsx.
    titles = []
    img_urls = []
    authors = []
    ratings = []
    details = []  # NOTE(review): not used in the visible code; kept in case later code reads it

    # exist_ok replaces the listdir() check, which raced with mkdir and
    # matched any directory entry of that name, not only directories.
    os.makedirs('中国文学', exist_ok=True)
    os.chdir('中国文学')

    start_url = 'https://book.douban.com/tag/中国文学?'
    depth = int(input())  # number of pages to crawl
    for i in range(depth):
        # The start offset advances by 20 for each page.
        url = start_url + 'start=' + str(20 * i)
        try:
            books = getHtml(url)
        except requests.exceptions.RequestException as e:
            # RequestException is the base class of ConnectionError, so
            # timeouts and HTTP errors also skip just this page.
            print("请求错误:,url:", url)
            print("错误详情:", e)
            continue
        titles += getTitles(books)
        img_urls += getImg_urls(books)
        authors += getAuthors(books)
        ratings += getRatings(books)

    # The context manager closes (and thereby saves) the workbook even if
    # writing a cell raises.
    with xlsxwriter.Workbook('中国文学.xlsx', {'nan_inf_to_errors': True}) as workbook:
        worksheet = workbook.add_worksheet('中国文学')
        for col, header in enumerate(('图片网址', '图书标题', '图书作者', '图书评价')):
            worksheet.write(0, col, header)
        worksheet.set_column('A:A', 55)
        worksheet.set_column('B:B', 30)
        worksheet.set_column('C:C', 60)
        worksheet.set_column('D:D', 10)
        # Row 0 holds the headers, so data rows start at 1. zip() stops at
        # the shortest list instead of raising IndexError on a mismatch.
        records = zip(img_urls, titles, authors, ratings)
        for row, record in enumerate(records, start=1):
            for col, value in enumerate(record):
                worksheet.write(row, col, value)