##网页下载器
import requests
from http import cookiejar
import urllib
class HtmlDownloader():
    """Download pages with browser-like headers and a freshly primed cookie jar."""

    @staticmethod
    def cookie(path='cookie.txt'):
        """Parse a ``name=value; name=value`` cookie string stored in a file.

        Declared as a static method so that both ``HtmlDownloader.cookie()``
        and ``instance.cookie()`` work.

        Args:
            path: File holding the cookie string. Defaults to ``cookie.txt``
                in the current working directory.

        Returns:
            dict mapping each cookie name to its value. Only the first ``=``
            of a pair separates name from value.
        """
        with open(path, 'r') as f:
            raw = f.read()
        cookies = {}
        for pair in raw.split(';'):
            pair = pair.strip()
            # A trailing ';' (or a stray fragment) has no '=' and cannot be
            # unpacked into name/value; skip it instead of raising.
            if '=' not in pair:
                continue
            name, value = pair.split('=', 1)
            cookies[name] = value
        return cookies

    def download(self, url, timeout=10):
        """Fetch ``url`` and return its body decoded as UTF-8.

        The URL is first opened through a cookie-aware urllib opener so the
        server can set cookies; those cookies are then sent along with the
        real ``requests`` call.

        Args:
            url: Address to fetch. ``None`` short-circuits to ``None``.
            timeout: Seconds to wait on each of the two requests before
                giving up, so a stalled server cannot hang the crawl.

        Returns:
            The response text when the status code is 200, otherwise ``None``.

        Raises:
            urllib.error.URLError: If the cookie-priming request fails.
            requests.RequestException: If the main request fails.
        """
        if url is None:
            return None
        # `import urllib` alone does not guarantee the `request` submodule is
        # loaded, so import it explicitly where it is used.
        import urllib.request

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',
            'Referer': r'http://movie.mtime.com',
            'Connection': 'keep-alive',
        }
        jar = cookiejar.CookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(jar))
        # Only the cookies collected in `jar` matter here; close the priming
        # response right away so its socket is not leaked.
        with opener.open(url, timeout=timeout):
            pass
        r = requests.get(url, headers=headers, cookies=jar, timeout=timeout)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None
##网页解析器
import re
import json
class HtmlParser():
    """Extract movie links and rating records from Mtime responses."""

    # Captures (full movie URL, numeric movie id). The dots are escaped so
    # that only the literal host name matches.
    _MOVIE_URL = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')
    # Placeholder stored when a box-office figure is not available.
    _NO_DATA = u'无'

    def parser_url(self, page_url, response):
        """Return the distinct movie links found in ``response``.

        Args:
            page_url: URL the page was fetched from (not used for matching).
            response: Page text to scan; may be empty or ``None``.

        Returns:
            list of ``(url, movie_id)`` string tuples, de-duplicated and in
            order of first appearance. Empty when nothing matches.
        """
        if not response:
            return []
        # dict.fromkeys de-duplicates while keeping a stable order.
        return list(dict.fromkeys(self._MOVIE_URL.findall(response)))

    def parser_json(self, page_url, response):
        """Turn a ``var name = {...};`` rating response into a record tuple.

        Args:
            page_url: URL of the request, used only in diagnostics.
            response: Raw response text.

        Returns:
            A 14-item tuple ``(MovieId, movieTitle, RatingFinal, ROtherFinal,
            RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
            AttitudeCount, total box office, today box office, Rank,
            ShowDays, isRelease)``, or ``None`` when the response cannot be
            decoded or lacks the expected fields.
        """
        value = self._extract_payload(response)
        if not isinstance(value, dict):
            return None
        detail = value.get('value')
        if not isinstance(detail, dict) or not detail:
            # No usable payload: the builder reports the problem and
            # returns None.
            return self._parser_no_release(page_url, value)
        if detail.get('hotValue') is None:
            return self._parser_release(page_url, value)
        return self._parser_no_release(page_url, value, isRelease=2)

    @staticmethod
    def _extract_payload(response):
        """Decode the JSON value that follows the first ``=`` in ``response``.

        The JSON decoder itself finds where the value ends, so a ``;`` inside
        a string (for example in a title) does not truncate the payload.
        Returns ``None`` when there is no ``=`` or the value is not JSON.
        """
        if not response:
            return None
        _, sep, tail = response.partition('=')
        if not sep:
            return None
        try:
            # raw_decode tolerates trailing text but not leading whitespace.
            value, _ = json.JSONDecoder().raw_decode(tail.lstrip())
        except ValueError as e:
            print(e)
            return None
        return value

    @staticmethod
    def _rating_fields(detail):
        """Return the nine title/rating fields shared by every record."""
        rating = detail.get('movieRating')
        return (
            rating.get('MovieId'),
            detail.get('movieTitle'),
            rating.get('RatingFinal'),
            rating.get('ROtherFinal'),
            rating.get('RPictureFinal'),
            rating.get('RDirectorFinal'),
            rating.get('RStoryFinal'),
            rating.get('Usercount'),
            rating.get('AttitudeCount'),
        )

    @classmethod
    def _amount(cls, box_office, key):
        """Join ``box_office[key]`` with its ``<key>Unit`` into one string."""
        amount = box_office.get(key)
        if amount is None:
            return cls._NO_DATA
        unit = box_office.get(key + 'Unit')
        # %s formatting also copes with numeric amounts, which plain `+`
        # concatenation would reject.
        return u'%s%s' % (amount, u'' if unit is None else unit)

    def _parser_release(self, page_url, value):
        """Build the record for a payload without ``hotValue`` (isRelease=1).

        Returns the 14-item tuple described in ``parser_json``, or ``None``
        (after printing the problem) when required fields are missing.
        """
        try:
            detail = value.get('value')
            record = self._rating_fields(detail)
            box_office = detail.get('boxOffice')
            if box_office is None:
                return record + (self._NO_DATA, self._NO_DATA, 0, 0, 1)
            return record + (
                self._amount(box_office, 'TotalBoxOffice'),
                self._amount(box_office, 'TodayBoxOffice'),
                # dict.get never raises; fall back to 0 explicitly so a
                # missing value does not become None.
                box_office.get('Rank') or 0,
                box_office.get('ShowDays') or 0,
                1,
            )
        except (AttributeError, TypeError) as e:
            print(e, page_url, value)
            return None

    def _parser_no_release(self, page_url, value, isRelease=0):
        """Build the record for a payload that carries no box-office data.

        The rank comes from ``hotValue.Ranking`` when present, else 0.
        Returns the 14-item tuple described in ``parser_json``, or ``None``
        (after printing the problem) when required fields are missing.
        """
        try:
            detail = value.get('value')
            record = self._rating_fields(detail)
            hot_value = detail.get('hotValue')
            rank = hot_value.get('Ranking') if isinstance(hot_value, dict) else None
            return record + (self._NO_DATA, self._NO_DATA, rank or 0, 0, isRelease)
        except (AttributeError, TypeError) as e:
            print(e, page_url, value)
            return None
##数据存储器
import sqlite3
class DataOutput():
    """Buffer movie records and write them to the ``MTime`` SQLite table."""

    def __init__(self, db_path='/home/as/test.db'):
        """Open the database and make sure the ``MTime`` table exists.

        Args:
            db_path: SQLite database file; ``':memory:'`` gives a throwaway
                in-memory database.
        """
        self.cx = sqlite3.connect(db_path)
        self.create_table('MTime')
        # Rows waiting to be written by the next flush.
        self.datas = []

    def create_table(self, table_name):
        """Create ``table_name`` with the movie-record schema if missing.

        ``table_name`` is interpolated into the statement (SQL parameters
        cannot stand in for identifiers), so it must come from trusted code.
        """
        # The box-office columns are declared varchar so they get TEXT
        # affinity; the previous misspelling "varchat" is not a recognised
        # text type and let numeric-looking strings be stored as numbers.
        values = '''
        id integer primary key,
        MovieId integer,
        MovieTitle varchar(40) NOT NULL,
        RatingFinal REAL NOT NULL DEFAULT 0.0,
        ROtherFinal REAL NOT NULL DEFAULT 0.0,
        RPictureFinal REAL NOT NULL DEFAULT 0.0,
        RDirectorFinal REAL NOT NULL DEFAULT 0.0,
        RStoryFinal REAL NOT NULL DEFAULT 0.0,
        Usercount integer NOT NULL DEFAULT 0,
        AttitudeCount integer NOT NULL DEFAULT 0,
        TotalBoxOffice varchar(20) NOT NULL,
        TodayBoxOffice varchar(20) NOT NULL,
        Rank integer NOT NULL DEFAULT 0,
        ShowDays integer NOT NULL DEFAULT 0,
        isRelease integer NOT NULL
        '''
        self.cx.execute('CREATE TABLE IF NOT EXISTS %s ( %s ) ' % (table_name, values))

    def store_data(self, data):
        """Queue one record; flush to the database once more than 10 wait.

        Args:
            data: 14-item record tuple, or ``None`` (ignored).
        """
        if data is None:
            return
        self.datas.append(data)
        if len(self.datas) > 10:
            self.output_db('MTime')

    def output_db(self, table_name):
        """Insert every buffered record into ``table_name`` and empty the buffer.

        All rows are written in one transaction. If the insert fails the
        transaction is rolled back, the rows stay buffered and the
        ``sqlite3`` error propagates.
        """
        sql = ('INSERT INTO %s (MovieId,MovieTitle,RatingFinal,'
               'ROtherFinal,RPictureFinal,RDirectorFinal,RStoryFinal,'
               'Usercount,AttitudeCount,TotalBoxOffice,TodayBoxOffice,'
               'Rank,ShowDays,isRelease) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
               '' % table_name)
        # Removing items from the list while looping over it skipped every
        # other row; write the whole batch, then clear the buffer in one go.
        with self.cx:
            self.cx.executemany(sql, self.datas)
        del self.datas[:]

    def output_end(self):
        """Flush any remaining records and close the database connection."""
        try:
            if self.datas:
                self.output_db('MTime')
        finally:
            self.cx.close()
##爬虫调度器
import time
class SpiderMan():
    """Scheduler that wires the downloader, parser and data store together."""

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl ``root_url`` and store one rating record per linked movie.

        The root page is downloaded and scanned for movie links; for each
        link the rating service is queried and the parsed record is handed
        to the data store. A failure on one movie is printed and skipped.

        When the crawl ends (normally or through an error) the data store is
        finished with ``output_end()``, so records still waiting in its
        buffer are written. The store is closed afterwards, which makes the
        spider single-use.

        Args:
            root_url: Page whose movie links should be crawled.
        """
        try:
            content = self.downloader.download(root_url)
            # A failed download yields None; there is nothing to scan then.
            urls = self.parser.parser_url(root_url, content) if content else []
            for url in urls:
                try:
                    rank_url = self._rank_url(url[0], url[1])
                    rank_content = self.downloader.download(rank_url)
                    data = self.parser.parser_json(rank_url, rank_content)
                    self.output.store_data(data)
                except Exception as e:
                    # Best effort: one bad movie must not stop the crawl.
                    print(e)
        finally:
            self.output.output_end()

    @staticmethod
    def _timestamp():
        """Return the ``t`` request parameter for the current local time.

        Layout: year, month and day without zero padding, then zero-padded
        HHMMSS, then the fixed suffix ``3282``. Built from the struct_time
        fields because the ``%-m`` / ``%-d`` strftime directives are not
        available on every platform.
        """
        now = time.localtime()
        return '%d%d%d%02d%02d%02d3282' % (
            now.tm_year, now.tm_mon, now.tm_mday,
            now.tm_hour, now.tm_min, now.tm_sec)

    @classmethod
    def _rank_url(cls, movie_url, movie_id):
        """Build the rating-service URL for one movie page and its id."""
        return ('http://service.library.mtime.com/Movie.api'
                '?Ajax_CallBack=true'
                '&Ajax_CallBackType=Mtime.Library.Services'
                '&Ajax_CallBackMethod=GetMovieOverviewRating'
                '&Ajax_CrossDomain=1'
                '&Ajax_RequestUrl=%s'
                '&t=%s'
                '&Ajax_CallBackArgument0=%s' % (movie_url, cls._timestamp(), movie_id))
# Script entry point: build the scheduler and start crawling from the root page.
spider = SpiderMan()
spider.crawl(root_url='http://theater.mtime.com/China_Beijing/')
##更多爬虫实例请见 https://blog.youkuaiyun.com/weixin_39777626/article/details/81564819