爬取影评信息

最新推荐文章于 2023-09-25 21:26:16 发布

原创最新推荐文章于 2023-09-25 21:26:16 发布 · 948 阅读

1 ·

CC 4.0 BY-SA版权

Python 同时被 2 个专栏收录

120 篇文章

订阅专栏

爬虫

39 篇文章

订阅专栏

本文介绍了一个使用Python实现的mtime电影信息爬虫项目，通过网页下载器获取网页内容，利用正则表达式解析URL和JSON数据，最后将解析的数据存储到SQLite数据库中。爬虫覆盖了北京地区的电影数据，包括电影评分、票房、排名等详细信息。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

## 网页下载器

import requests
from http import cookiejar
import urllib

class HtmlDownloader():
    """Download pages using browser-like headers and server-issued cookies."""

    @staticmethod
    def cookie():
        """Parse ``cookie.txt`` (in the working directory) into a dict.

        The file is read as a single ``name=value; name2=value2`` string.
        Empty segments (e.g. after a trailing ``;`` or newline) are skipped.

        Returns:
            dict: cookie names mapped to their values.

        Raises:
            OSError: if ``cookie.txt`` cannot be opened.
            ValueError: if a non-empty segment contains no ``=``.
        """
        cookies = {}
        with open('cookie.txt', 'r') as f:
            for segment in f.read().split(';'):
                segment = segment.strip()
                if not segment:
                    continue
                name, value = segment.split('=', 1)
                cookies[name] = value
        return cookies

    def download(self, url):
        """Fetch *url* and return its body as text.

        Args:
            url: address to fetch; ``None`` is accepted and yields ``None``.

        Returns:
            The response text (decoded as UTF-8) when the status code is 200,
            otherwise ``None``.
        """
        if url is None:
            return None
        # A bare ``import urllib`` does not guarantee the ``request``
        # submodule is loaded, so import it explicitly where it is used.
        import urllib.request

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',
            'Referer': r'http://movie.mtime.com',
            'Connection': 'keep-alive'
        }
        cookie = cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
        # Pre-flight request whose only purpose is to let the server populate
        # the cookie jar; close the response right away so the socket is not
        # leaked.
        with opener.open(url):
            pass
        r = requests.get(url, headers=headers, cookies=cookie)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None

## 网页解析器

import re
import json

class HtmlParser():
    """Extract movie links and rating/box-office records from responses."""

    def parser_url(self, page_url, response):
        """Collect the distinct movie links found in *response*.

        Args:
            page_url: address the response came from (unused, kept for the
                existing call signature).
            response: page text, or ``None`` when the download failed.

        Returns:
            list of ``(url, movie_id)`` string tuples without duplicates, in
            no particular order; empty when there is nothing to parse.
        """
        if not response:
            return []
        # Dots are escaped so that only the literal host name matches.
        pattern = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')
        return list(set(pattern.findall(response)))

    def parser_json(self, page_url, response):
        """Parse a ``name = {json};`` style response into a record tuple.

        The JSON value following the first ``=`` is decoded with
        ``raw_decode`` so that a ``;`` inside a string value does not
        truncate the payload.

        Args:
            page_url: address the response came from (used in diagnostics).
            response: response text, or ``None`` when the download failed.

        Returns:
            A 14-item tuple (see ``_parser_release``), or ``None`` when the
            response holds no decodable payload or the record is incomplete.
        """
        if not response:
            return None
        _, sep, payload = response.partition('=')
        if not sep:
            return None
        try:
            # raw_decode does not skip leading whitespace by itself.
            value, _ = json.JSONDecoder().raw_decode(payload.lstrip())
        except ValueError as e:
            print(e, page_url)
            return None
        try:
            detail = value.get('value')
        except AttributeError as e:
            # The top-level JSON value was not an object.
            print(e)
            return None
        if not detail:
            return self._parser_no_release(page_url, value)
        if detail.get('hotValue') is None:
            return self._parser_release(page_url, value)
        return self._parser_no_release(page_url, value, isRelease=2)

    @staticmethod
    def _rating_fields(detail):
        """Return the nine leading columns shared by every record tuple."""
        rating = detail.get('movieRating')
        return (
            rating.get('MovieId'),
            detail.get('movieTitle'),
            rating.get('RatingFinal'),
            rating.get('ROtherFinal'),
            rating.get('RPictureFinal'),
            rating.get('RDirectorFinal'),
            rating.get('RStoryFinal'),
            rating.get('Usercount'),
            rating.get('AttitudeCount'),
        )

    def _parser_release(self, page_url, value):
        """Build the record for an entry without a ``hotValue`` section.

        Returns:
            ``(MovieId, movieTitle, RatingFinal, ROtherFinal, RPictureFinal,
            RDirectorFinal, RStoryFinal, Usercount, AttitudeCount,
            total box office, today box office, Rank, ShowDays, isRelease)``
            with ``isRelease`` fixed to 1, or ``None`` (after printing the
            problem) when a required section is missing or malformed.
        """
        try:
            isRelease = 1
            detail = value.get('value')
            head = self._rating_fields(detail)
            box_office = detail.get('boxOffice')
            if box_office is None:
                return head + (u'无', u'无', 0, 0, isRelease)
            total = box_office.get('TotalBoxOffice') + box_office.get('TotalBoxOfficeUnit')
            today = box_office.get('TodayBoxOffice') + box_office.get('TodayBoxOfficeUnit')
            # dict.get never raises, so a missing key yields None; fall back
            # to 0 explicitly instead of relying on an except clause.
            rank = box_office.get('Rank') or 0
            show_days = box_office.get('ShowDays') or 0
            return head + (total, today, rank, show_days, isRelease)
        except (AttributeError, TypeError) as e:
            # AttributeError: a section was None; TypeError: amount/unit
            # could not be concatenated.
            print(e, page_url, value)
            return None

    def _parser_no_release(self, page_url, value, isRelease=0):
        """Build the record for an entry that carries no box-office data.

        Args:
            page_url: address the data came from (used in diagnostics).
            value: decoded JSON object.
            isRelease: release-state flag stored as the last tuple item.

        Returns:
            A 14-item tuple laid out like ``_parser_release``'s, with both
            box-office columns set to ``'无'`` and ShowDays set to 0, or
            ``None`` (after printing the problem) when the rating section is
            missing.
        """
        try:
            detail = value.get('value')
            head = self._rating_fields(detail)
            try:
                rank = detail.get('hotValue').get('Ranking') or 0
            except AttributeError:
                # No hotValue section available.
                rank = 0
            return head + (u'无', u'无', rank, 0, isRelease)
        except (AttributeError, TypeError) as e:
            print(e, page_url, value)
            return None

## 数据存储器

import sqlite3

class DataOutput():
    """Buffer record tuples and write them to the ``MTime`` SQLite table."""

    def __init__(self, db_path='/home/as/test.db'):
        """Open the database and make sure the ``MTime`` table exists.

        Args:
            db_path: SQLite database location; defaults to the path the
                crawler has always used. ``':memory:'`` is accepted.
        """
        self.cx = sqlite3.connect(db_path)
        self.create_table('MTime')
        # Records waiting to be written by output_db().
        self.datas = []

    def create_table(self, table_name):
        """Create *table_name* with the crawler's schema if it is missing.

        *table_name* is interpolated into the SQL text, so it must be a
        trusted identifier.
        """
        values = '''
            id integer primary key,
            MovieId integer,
            MovieTitle varchar(40) NOT NULL,
            RatingFinal REAL NOT NULL DEFAULT 0.0,
            ROtherFinal REAL NOT NULL DEFAULT 0.0,
            RPictureFinal REAL NOT NULL DEFAULT 0.0,
            RDirectorFinal REAL NOT NULL DEFAULT 0.0,
            RStoryFinal REAL NOT NULL DEFAULT 0.0,
            Usercount integer NOT NULL DEFAULT 0,
            AttitudeCount integer NOT NULL DEFAULT 0,
            TotalBoxOffice varchar(20) NOT NULL,
            TodayBoxOffice varchar(20) NOT NULL,
            Rank integer NOT NULL DEFAULT 0,
            ShowDays integer NOT NULL DEFAULT 0,
            isRelease integer NOT NULL
            '''
        self.cx.execute('CREATE TABLE IF NOT EXISTS %s ( %s ) ' % (table_name, values))

    def store_data(self, data):
        """Queue one record; flush once more than 10 records are buffered.

        Args:
            data: 14-item tuple matching the insert columns, or ``None``
                (ignored).
        """
        if data is None:
            return
        self.datas.append(data)
        if len(self.datas) > 10:
            self.output_db('MTime')

    def output_db(self, table_name):
        """Insert every buffered record into *table_name* and empty the buffer.

        The rows are written in one transaction: it is committed on success
        and rolled back (with the buffer left untouched) if an insert fails.
        *table_name* is interpolated into the SQL text, so it must be a
        trusted identifier.
        """
        sql = ('INSERT INTO %s (MovieId,MovieTitle,RatingFinal,'
               'ROtherFinal,RPictureFinal,RDirectorFinal,RStoryFinal,'
               'Usercount,AttitudeCount,TotalBoxOffice,TodayBoxOffice,'
               'Rank,ShowDays,isRelease) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
               '' % table_name)
        # Write the whole buffer first and clear it afterwards; removing
        # items while looping over the same list skips every other record.
        with self.cx:
            self.cx.executemany(sql, self.datas)
        self.datas.clear()

    def output_end(self):
        """Flush any remaining records and close the connection."""
        if len(self.datas) > 0:
            self.output_db('MTime')
        self.cx.close()

## 爬虫调度器

import time

class SpiderMan():
    """Scheduler that wires the downloader, parser and data store together."""

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl *root_url*, then fetch and store the rating of each movie link.

        A failure on an individual movie is printed and skipped. When the
        crawl ends (normally or through an error) the data store is
        finalized with ``output_end()``, so buffered records are written and
        the connection is closed; the instance is not reusable afterwards.

        Args:
            root_url: listing page whose movie links are followed.
        """
        try:
            content = self.downloader.download(root_url)
            if content is None:
                # The root page could not be fetched; nothing to follow.
                return
            urls = self.parser.parser_url(root_url, content)
            for url in urls:
                try:
                    now = time.localtime()
                    # Year, then month and day without zero padding, then
                    # HHMMSS and a fixed suffix. Built by hand because the
                    # '%-m' / '%-d' strftime codes are not portable.
                    t = '%d%d%d%s3282' % (now.tm_year, now.tm_mon, now.tm_mday,
                                          time.strftime('%H%M%S', now))
                    rank_url = ('http://service.library.mtime.com/Movie.api'
                                '?Ajax_CallBack=true'
                                '&Ajax_CallBackType=Mtime.Library.Services'
                                '&Ajax_CallBackMethod=GetMovieOverviewRating'
                                '&Ajax_CrossDomain=1'
                                '&Ajax_RequestUrl=%s'
                                '&t=%s'
                                '&Ajax_CallBackArgument0=%s') % (url[0], t, url[1])
                    rank_content = self.downloader.download(rank_url)
                    data = self.parser.parser_json(rank_url, rank_content)
                    self.output.store_data(data)
                except Exception as e:
                    # Best effort: one bad movie must not stop the crawl.
                    print(e)
        finally:
            self.output.output_end()

# Script entry point: build the scheduler and crawl the Beijing theater
# listing page. NOTE: there is no __main__ guard, so this runs whenever the
# module is executed or imported.
spider=SpiderMan()
spider.crawl('http://theater.mtime.com/China_Beijing/')