There are already plenty of crawler tutorials written by people far more experienced than me. For a beginner, Douban is probably the best site to practice on: it imposes few restrictions, and you can fetch data about movies and books without logging in.
However, most of the material online targets Python 2.x, while I am using Python 3.5.2, so I had to work things out step by step and write my own spider.
What it does:
1. Crawl Douban content by category (book, movie, ...).
2. Save the crawled data to an Excel spreadsheet.
Libraries used (a minimal sketch combining the three follows this list):
1. urllib (fetch the web pages)
2. bs4 (BeautifulSoup, parse the HTML)
3. openpyxl (write the Excel file)
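To show how these three pieces fit together before diving into the full script, here is a minimal sketch; the tag 'python', the User-Agent string and the output file name demo.xlsx are placeholders of my own, not part of the script below:

import urllib.request
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Fetch one tag page (placeholder tag; the real spider below loops over pages).
url = 'http://www.douban.com/tag/' + urllib.request.quote('python') + '/book?start=0'
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib.request.urlopen(req).read().decode('utf-8')

# Parse the page and pull out the book titles.
soup = BeautifulSoup(html, 'html.parser')
titles = [a.string.strip() for a in soup.find_all('a', {'class': 'title'}) if a.string]

# Write the titles into an Excel file, one row each.
wb = Workbook()
ws = wb.active
ws.append(['title'])
for t in titles:
    ws.append([t])
wb.save('demo.xlsx')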
This is the first spider script I have written. It is simple and borrows heavily from code posted by others, but it gave me a basic understanding of web crawling, and along the way I picked up some HTML5 and CSS.
One nice thing about crawling Douban is that its URLs are very regular and well aligned, so you can search books, movies or music just by swapping one part of the URL; a small example follows.
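For instance, the URL pattern can be parameterised on the category segment. This is only a sketch: the movie and music variants are my assumption that they follow the same layout as the book URL used in the script.

import urllib.request

def tag_url(category, tag, start=0):
    # category: 'book', 'movie' or 'music'; tag: the keyword, URL-encoded if non-ASCII.
    return ('http://www.douban.com/tag/' + urllib.request.quote(tag)
            + '/' + category + '?start=' + str(start))

print(tag_url('book', '小说'))
print(tag_url('movie', '科幻'))
print(tag_url('music', '爵士'))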
One problem I have not solved cleanly yet: when openpyxl writes the Excel file, the data always starts on the second sheet, while the same code on Python 2.7 does not show this behaviour.
I have not found a good fix yet and will keep digging; a possible workaround is sketched below.
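For what it's worth, openpyxl's Workbook() already contains one default sheet (named 'Sheet'), and create_sheet() appends new sheets after it, which matches the symptom of the data starting on the second sheet. A possible workaround, sketched here but not yet folded into the script below, is to drop that default sheet before writing (the method is wb.remove() in recent openpyxl versions, wb.remove_sheet() in older ones):

from openpyxl import Workbook

wb = Workbook()
wb.remove(wb.active)                      # drop the default 'Sheet'
ws = wb.create_sheet(title='python')      # the tag sheet is now the first sheet
ws.append(['序号', '书名', '评分', '评价人数', '作者', '出版社'])
wb.save('demo_sheets.xlsx')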
import sys
import time
import urllib.request
import urllib.error
import requests   # only needed by the commented-out earlier version below
import numpy as np
from bs4 import BeautifulSoup
from openpyxl import Workbook

# imp.reload(sys) / sys.setdefaultencoding('utf8') were only needed on Python 2;
# Python 3 strings are Unicode by default, so they are dropped here.

# Some User-Agent strings, used to make the requests look like they come from a browser.
hds = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}]

# book_spider is the main crawling routine: fetch each tag page with urllib.request,
# decode it as UTF-8 (check the page's declared encoding first to decide whether to decode),
# then parse the HTML with BeautifulSoup and collect the useful fields into a list.
def book_spider(book_tag):
    page_num = 0
    book_list = []
    try_times = 0
    while True:
        # url = 'http://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book?start=0'  # for test
        url = 'http://www.douban.com/tag/' + urllib.request.quote(book_tag) + '/book?start=' + str(page_num*15)
        time.sleep(np.random.rand()*5)   # random pause so the requests look less like a bot
        try:
            req = urllib.request.Request(url, headers=hds[page_num % len(hds)])
            source_code = urllib.request.urlopen(req).read()
            plain_text = source_code.decode('utf-8')
        except (urllib.error.HTTPError, urllib.error.URLError) as e:
            print(e)
            continue
        # Previous version; the IP gets banned easily:
        # source_code = requests.get(url)
        # plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        list_soup = soup.find('div', {'class': 'mod book-list'})
        try_times += 1
        if list_soup is None and try_times < 200:
            continue
        elif list_soup is None or len(list_soup) <= 1:
            break   # break when no information is returned after 200 requests
        for book_info in list_soup.findAll('dd'):
            title = book_info.find('a', {'class': 'title'}).string.strip()
            desc = book_info.find('div', {'class': 'desc'}).string.strip()
            desc_list = desc.split('/')
            book_url = book_info.find('a', {'class': 'title'}).get('href')
            try:
                author_info = '作者/译者: ' + '/'.join(desc_list[0:-3])
            except Exception:
                author_info = '作者/译者: 暂无'
            try:
                pub_info = '出版信息: ' + '/'.join(desc_list[-3:])
            except Exception:
                pub_info = '出版信息: 暂无'
            try:
                rating = book_info.find('span', {'class': 'rating_nums'}).string.strip()
            except Exception:
                rating = '0.0'
            try:
                # people_num = book_info.findAll('span')[2].string.strip()
                people_num = get_people_num(book_url)
                people_num = people_num.strip('人评价')
            except Exception:
                people_num = '000'
            book_list.append([title, rating, people_num, author_info, pub_info])
            try_times = 0   # reset to 0 once valid information is found
        page_num += 1
        print('Downloading Information From Page %d' % page_num)
    return book_list

def get_people_num(url):
    # url = 'http://book.douban.com/subject/6082808/?from=tag_all'  # for test
    page_num = 0
    plain_text = ''   # keep defined even if the request below fails
    try:
        req = urllib.request.Request(url, headers=hds[page_num % len(hds)])
        source_code = urllib.request.urlopen(req).read()
        plain_text = source_code.decode('utf-8')
        # debug dump of the fetched page, disabled by default:
        # with open("plain_text11.txt", 'w+', encoding='utf-8') as f:
        #     f.write(plain_text)
    except (urllib.error.HTTPError, urllib.error.URLError):
        info = sys.exc_info()
        print(info[0], ':', info[1])
    soup = BeautifulSoup(plain_text, 'html.parser')
    # The number of ratings sits in the second <span> inside the rating_sum div.
    rating_sum = soup.find('div', {'class': 'rating_sum'})
    people_num = rating_sum.findAll('span')[1].string.strip()
    return people_num

def do_spider(book_tag_lists):
    book_lists = []
    for book_tag in book_tag_lists:
        book_list = book_spider(book_tag)
        # sort by rating; compare as floats so '10.0' does not sort below '9.5'
        book_list = sorted(book_list, key=lambda x: float(x[1]), reverse=True)
        book_lists.append(book_list)
    return book_lists

def print_book_lists_excel(book_lists, book_tag_lists):
    # wb = Workbook(optimized_write=True)
    wb = Workbook()   # note: Workbook() already contains a default sheet named 'Sheet'
    ws = []
    for i in range(len(book_tag_lists)):
        ws.append(wb.create_sheet(title=book_tag_lists[i]))
    for i in range(len(book_tag_lists)):
        ws[i].append(['序号', '书名', '评分', '评价人数', '作者', '出版社'])
        count = 1
        for bl in book_lists[i]:
            ws[i].append([count, bl[0], float(bl[1]), int(bl[2]), bl[3], bl[4]])
            count += 1
    save_path = 'book_list'
    for i in range(len(book_tag_lists)):
        save_path += '-' + book_tag_lists[i]
    save_path += '.xlsx'
    wb.save(save_path)

if __name__ == '__main__':
    book_tag_lists = ['python']
    book_lists = do_spider(book_tag_lists)
    print_book_lists_excel(book_lists, book_tag_lists)
The crawled result is shown in the figure below: for the keyword 'python', the top fifty books by ranking were collected.