PROJECT 1: Douban Web Scraper

This post describes a simple Python web-scraping project. The scraper collects book information for a given tag from Douban, organizes it, and saves it to an Excel file. It uses the urllib and BeautifulSoup libraries to fetch and parse the web pages.


There are already plenty of scraping tutorials written by experienced authors. In my view, Douban is the best place for a beginner to start: the site imposes few restrictions, and you can scrape all kinds of movie and book data without logging in.

However, most of the material online is based on Python 2.x, while I am using Python 3.5.2, so I still had to work things out on my own, step by step, to get a working scraper.

Features:

1. Scrape the listings for a given Douban category (book, movie, ...).

2. Save the scraped content to an Excel spreadsheet.

Libraries used (a minimal end-to-end sketch follows this list):

1. urllib (fetch web pages)

2. bs4 (parse pages with BeautifulSoup)

3. openpyxl (write Excel files)
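
These three libraries chain together as a fetch → parse → write pipeline. The sketch below is only a minimal illustration of that flow under the same assumptions as the full script further down (the tag URL, the 'title' link class, and the output filename 'sketch.xlsx' are placeholders), not the script itself:

import urllib.request
from bs4 import BeautifulSoup
from openpyxl import Workbook

# 1. fetch (urllib): download one tag page as UTF-8 text
url = 'http://www.douban.com/tag/' + urllib.request.quote('小说') + '/book?start=0'
html = urllib.request.urlopen(url).read().decode('utf-8')

# 2. parse (bs4): pull the book titles out of the page
soup = BeautifulSoup(html, 'html.parser')
titles = [a.string for a in soup.findAll('a', {'class': 'title'})]

# 3. write (openpyxl): one title per row in a new spreadsheet
wb = Workbook()
ws = wb.active
for t in titles:
    ws.append([t])
wb.save('sketch.xlsx')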

This is the first scraper script I have written. It is very simple and borrows heavily from code posted by more experienced authors, but writing it gave me a basic understanding of web scraping, and along the way I also picked up some HTML5 and CSS.

One advantage of scraping Douban is that its URLs are very regular and consistently structured, so you can search books, movies, or music simply by swapping out one part of the URL, as the sketch below illustrates.
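
For example, the book listing pages used in this script follow the pattern below; by the same logic, swapping 'book' for 'movie' or 'music' should target the other categories (only the book URLs are exercised in this script, so the other two are an assumption):

import urllib.request

def tag_url(category, tag, page_num):
    # category: 'book' is what this script uses; 'movie' / 'music' are assumed to follow the same pattern
    # each listing page shows 15 items, so the offset advances in steps of 15
    return ('http://www.douban.com/tag/' + urllib.request.quote(tag)
            + '/' + category + '?start=' + str(page_num * 15))

print(tag_url('book', '小说', 0))    # first page of books tagged 小说
print(tag_url('movie', '小说', 2))   # third page of movies with the same tag (assumed pattern)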

One problem I have not solved cleanly yet: when openpyxl writes the Excel file, the data always starts from the second sheet, whereas the same code under Python 2.7 does not show this behavior.

I have not found a good fix yet and will keep looking into it.
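
A likely explanation (my assumption, not verified against the Python 2.7 setup): a plain Workbook() in openpyxl already contains one default, empty sheet, and create_sheet() appends new sheets after it, so the scraped data only starts on the second sheet; the old Workbook(optimized_write=True) write-only mode started with no sheets at all. A minimal sketch of one possible fix is to drop the default sheet before creating the per-tag sheets (recent openpyxl versions use wb.remove(); older ones call it remove_sheet()):

from openpyxl import Workbook

wb = Workbook()
wb.remove(wb.active)                  # drop the automatically created empty sheet
ws = wb.create_sheet(title='python')  # this now really is the first sheet
ws.append(['序号', '书名', '评分'])
wb.save('demo.xlsx')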

import time
import urllib.request
import urllib.error
import numpy as np
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Note: the Python 2 idiom imp.reload(sys); sys.setdefaultencoding('utf8') is not
# needed in Python 3, where strings are Unicode by default.




# A few User-Agent strings, used to make the requests look like they come from a browser.
hds=[{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
     {'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
     {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}]

# book_spider is the main crawling routine: it fetches each tag page with urllib.request,
# decodes the response as UTF-8 (check the page's encoding first to decide whether decoding
# is needed), then parses the HTML with BeautifulSoup and collects the useful fields into a list.

def book_spider(book_tag):
    page_num=0
    book_list=[]
    try_times=0

    while(1):
        #url='http://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book?start=0' # For Test
        url='http://www.douban.com/tag/'+urllib.request.quote(book_tag)+'/book?start='+str(page_num*15)
        time.sleep(np.random.rand()*5)  # random pause between requests so the crawler is less aggressive
        
        try:
            req = urllib.request.Request(url, headers=hds[page_num%len(hds)])
            source_code = urllib.request.urlopen(req).read()
            plain_text = source_code.decode('utf-8')
        except (urllib.error.HTTPError, urllib.error.URLError) as e:
            print(e)
            continue

        # An earlier version fetched the page with requests.get(url) directly,
        # but that made it easy for the IP to get banned.

        soup = BeautifulSoup(plain_text, 'html.parser')

        list_soup = soup.find('div', {'class': 'mod book-list'})

        try_times+=1
        if list_soup is None and try_times<200:
            continue
        elif list_soup is None or len(list_soup)<=1:
            break # stop once no information has been returned after 200 attempts
        
        for book_info in list_soup.findAll('dd'):
            title = book_info.find('a', {'class':'title'}).string.strip()
            desc = book_info.find('div', {'class':'desc'}).string.strip()
            desc_list = desc.split('/')
            book_url = book_info.find('a', {'class':'title'}).get('href')

            try:
                author_info = '作者/译者: ' + '/'.join(desc_list[0:-3])
            except:
                author_info ='作者/译者: 暂无'
            try:
                pub_info = '出版信息: ' + '/'.join(desc_list[-3:])
            except:
                pub_info = '出版信息: 暂无'
            try:
                rating = book_info.find('span', {'class':'rating_nums'}).string.strip()
            except:
                rating='0.0'
            try:
                # visit the book's own page to read the number of ratings
                people_num = get_people_num(book_url)
                people_num = people_num.strip('人评价')
            except:
                people_num ='000'

            book_list.append([title,rating,people_num,author_info,pub_info])
            try_times=0 # reset to 0 once valid information has been collected
        page_num+=1
        print('Downloading Information From Page %d' % page_num)
    return book_list


def get_people_num(url):
    #url='http://book.douban.com/subject/6082808/?from=tag_all' # For Test
    # Fetch a single book page and return its rating-count string (e.g. '1234人评价').
    try:
        req = urllib.request.Request(url, headers=hds[np.random.randint(0,len(hds))])
        source_code = urllib.request.urlopen(req).read()
        plain_text = source_code.decode('utf-8')
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        print(e)
        return '0'
    soup = BeautifulSoup(plain_text, 'html.parser')
    rating_sum = soup.find('div',{'class':'rating_sum'})
    people_num = rating_sum.findAll('span')[1].string.strip()
    return people_num


def do_spider(book_tag_lists):
    book_lists=[]
    for book_tag in book_tag_lists:
        book_list=book_spider(book_tag)
        book_list=sorted(book_list,key=lambda x:float(x[1]),reverse=True) # sort numerically by rating, highest first
        book_lists.append(book_list)
    return book_lists


def print_book_lists_excel(book_lists,book_tag_lists):
    wb=Workbook() # note: Workbook() already holds a default empty sheet, hence the "second sheet" issue described above
    ws=[]
    for i in range(len(book_tag_lists)):
        ws.append(wb.create_sheet(title=book_tag_lists[i])) # one sheet per tag
    for i in range(len(book_tag_lists)):
        ws[i].append(['序号','书名','评分','评价人数','作者','出版社'])
        count=1
        for bl in book_lists[i]:
            ws[i].append([count,bl[0],float(bl[1]),int(bl[2]),bl[3],bl[4]])
            count+=1
    save_path='book_list'
    for i in range(len(book_tag_lists)):
        save_path+=('-'+book_tag_lists[i])
    save_path+='.xlsx'
    wb.save(save_path)




if __name__=='__main__':
    book_tag_lists = ['python']
    book_lists=do_spider(book_tag_lists)
    print_book_lists_excel(book_lists,book_tag_lists)
#The resulting spreadsheet (screenshot omitted here) lists the top fifty books returned for the keyword 'python', ranked by rating.