# Scrape book listings from Douban's book tag "中国文学" (Chinese Literature) and save them to an .xlsx file.

import requests
from bs4 import BeautifulSoup
import xlsxwriter
import os
def getHtml(url, timeout=10):
    """Fetch one Douban tag-listing page and return its per-book <li> elements.

    Args:
        url: Full listing-page URL (tag page plus a ``start=`` offset).
        timeout: Seconds before the HTTP request is aborted. The original
            call had no timeout, so a stalled connection hung forever.

    Returns:
        List of <li> tags found inside ``ul.subject-list``, or an empty
        list when the page has no such list (blocked / empty / changed page),
        instead of crashing with AttributeError on None.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36"}
    data = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(data.text, 'lxml')
    dou_books = soup.find('ul', {'class': 'subject-list'})
    if dou_books is None:
        # Page did not contain the expected listing; let callers treat it as empty.
        return []
    return dou_books.find_all('li')

def getTitles(books):
    """Return each book's title text (second <a> tag in the entry).

    Newlines and spaces are stripped out of the raw text so titles come
    back as compact strings.
    """
    return [
        entry.find_all('a')[1].get_text().replace('\n', '').replace(' ', '')
        for entry in books
    ]

def getImg_urls(books):
    """Return the cover-image URL for each book entry.

    The cover lives in the entry's first <a> tag, as the ``src`` attribute
    of its <img> child.
    """
    return [entry.find_all('a')[0].find('img').get('src') for entry in books]

def getAuthors(books):
    """Return the publication info string (``div.pub``) for each book entry.

    Newlines and spaces are stripped from the raw text. Entries that lack a
    ``pub`` div yield '' instead of raising AttributeError on None, so the
    returned list stays aligned one-to-one with the other per-book lists.
    """
    authors = []
    for book in books:
        pub = book.find('div', {'class': 'pub'})
        if pub is None:
            # Keep positional alignment with titles/ratings/img_urls.
            authors.append('')
            continue
        authors.append(pub.get_text().replace('\n', '').replace(' ', ''))
    return authors

def getRatings(books):
    """Return the rating string (``span.rating_nums``) for each book entry.

    Newlines and spaces are stripped from the raw text. Unrated books on
    Douban have no rating span (and sometimes no star div at all); those
    entries yield '' instead of crashing with AttributeError on None, so
    the returned list stays aligned with the other per-book lists.
    """
    ratings = []
    for book in books:
        star = book.find('div', {'class': 'star clearfix'})
        rating_span = star.find('span', {'class': 'rating_nums'}) if star is not None else None
        if rating_span is None:
            # Unrated book: keep positional alignment with the other lists.
            ratings.append('')
            continue
        ratings.append(rating_span.get_text().replace('\n', '').replace(' ', ''))
    return ratings

if __name__ == '__main__':
    titles = []
    img_urls = []
    authors = []
    ratings = []
    # Create the output directory if needed (atomic, no listdir race), then work inside it.
    os.makedirs('中国文学', exist_ok=True)
    os.chdir('中国文学')
    start_url = 'https://book.douban.com/tag/中国文学?'
    depth = int(input())  # number of listing pages to scrape, 20 books per page
    for i in range(depth):
        try:
            url = start_url + 'start=' + str(20 * i)
            books = getHtml(url)
            titles += getTitles(books)
            img_urls += getImg_urls(books)
            authors += getAuthors(books)
            ratings += getRatings(books)
        except requests.exceptions.ConnectionError as e:
            # Skip this page on network failure and keep going with the rest.
            print("请求错误:,url:", url)
            print("错误详情:", e)
    # Write everything collected to a single worksheet.
    workbook = xlsxwriter.Workbook('中国文学.xlsx', {'nan_inf_to_errors': True})
    worksheet = workbook.add_worksheet('中国文学')
    # Header row and column widths.
    worksheet.write(0, 0, '图片网址')
    worksheet.write(0, 1, '图书标题')
    worksheet.write(0, 2, '图书作者')
    worksheet.write(0, 3, '图书评价')
    worksheet.set_column('A:A', 55)
    worksheet.set_column('B:B', 30)
    worksheet.set_column('C:C', 60)
    worksheet.set_column('D:D', 10)
    # zip keeps the rows consistent even if one list ends up shorter;
    # the original index-based loop could raise IndexError in that case.
    for row, (img, title, author, rating) in enumerate(
            zip(img_urls, titles, authors, ratings), start=1):
        worksheet.write(row, 0, img)
        worksheet.write(row, 1, title)
        worksheet.write(row, 2, author)
        worksheet.write(row, 3, rating)
    workbook.close()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值