# Douban (豆瓣): scraper for the Douban Books Top 250 list.
import re
import ssl
import urllib.request
# Replace the factory used to build the default HTTPS context with the
# "unverified" one, so HTTPS requests made through urllib in this process
# do not validate server certificates.
# NOTE(review): this is a process-wide override that relies on private
# ssl-module names and removes protection against man-in-the-middle
# attacks; presumably added to work around local certificate errors --
# confirm it is still needed.
ssl._create_default_https_context=ssl._create_unverified_context
class Douban:
    """Scraper that walks the Douban Books Top 250 list page by page.

    Each page is downloaded with ``urllib``, matched against
    ``BOOK_PATTERN`` and the extracted fields are printed to stdout.

    Attributes:
        queque: Unused list kept for backward compatibility.
        user_agent: Browser User-Agent string sent with every request.
        headers: HTTP request headers passed to ``urllib.request.Request``.
        enable: Flag set to ``True`` once ``start()`` has been called.
        pageIndex: Zero-based index of the next page to fetch.
        pageStorage: Unused list kept for backward compatibility.
        result: Raw regex matches (5-tuples) of the most recently parsed page.
    """

    # The listing URL takes an item offset in ``start``; ``start()`` prints
    # each page as PAGE_SIZE consecutive items, so page N begins at
    # N * PAGE_SIZE.
    BASE_URL = 'https://book.douban.com/top250?start='
    PAGE_SIZE = 25
    PAGE_COUNT = 10

    # Compiled once for all pages. The five capture groups are, in order:
    # the text following the first tag that carries a ``title=`` attribute,
    # the content of ``<p class="pl">``, the content of
    # ``<span class="rating_nums">``, the next parenthesised text, and the
    # content of ``<span class="inq">``.
    # NOTE(review): an entry without a ``<span class="inq">`` element cannot
    # match on its own, so the pattern may run on into the following entry --
    # verify against the live markup.
    BOOK_PATTERN = re.compile(
        r'.*?title=.*?>(.*?)<.*?<p class="pl">(.*?)</p>.*?<span class="rating_nums">(.*?)</span>.*?\((.*?)\).*?<span class="inq">(.*?)</span>',
        re.S,
    )

    def __init__(self):
        self.queque = []
        self.user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14'
        # The header name must be spelled with a hyphen; with an underscore
        # it is a different, unknown header and urllib still sends its own
        # default User-Agent.
        self.headers = {'User-Agent': self.user_agent}
        self.enable = False
        self.pageIndex = 0
        self.pageStorage = []
        self.result = []

    def _page_url(self):
        """Return the listing URL for the page selected by ``self.pageIndex``."""
        return self.BASE_URL + str(self.pageIndex * self.PAGE_SIZE)

    def _parse_books(self, page_code):
        """Extract book rows from the HTML text ``page_code``.

        Stores the raw 5-tuples in ``self.result`` and returns a list of
        4-element lists holding the first four groups with surrounding
        whitespace stripped (the fifth group is captured but not reported).
        """
        self.result = self.BOOK_PATTERN.findall(page_code)
        return [[field.strip() for field in match[:4]] for match in self.result]

    def getpage(self):
        """Download the current page, print every parsed book and return the HTML.

        Returns:
            The page body decoded as UTF-8.

        Raises:
            urllib.error.URLError: Propagated from ``urlopen`` when the
                request fails.
            UnicodeDecodeError: If the response body is not valid UTF-8.
        """
        req = urllib.request.Request(self._page_url(), headers=self.headers)
        with urllib.request.urlopen(req) as response:
            page_code = response.read().decode('utf-8')
        for book in self._parse_books(page_code):
            print(book)
        return page_code

    def start(self):
        """Fetch pages from ``self.pageIndex`` up to ``PAGE_COUNT``.

        Prints the item range covered by each page before fetching it.
        """
        self.enable = True
        while self.pageIndex < self.PAGE_COUNT:
            first_item = self.pageIndex * self.PAGE_SIZE
            print('{}~{}'.format(first_item, first_item + self.PAGE_SIZE - 1))
            self.getpage()
            self.pageIndex += 1
if __name__ == '__main__':
    # Script entry point: build a spider and crawl every page.
    Douban().start()