豆瓣Top250电影爬虫实战-CSDN博客

本文链接：https://blog.csdn.net/m0_60255954/article/details/128195004

豆瓣电影Top250
使用SQLite数据库、XPath和正则表达式
PS：XPath的string(.)函数可以取出嵌套节点内的全部文本内容

import requests
from lxml import etree
import re
import sqlite3
import os
import time
# Request headers: a desktop Firefox User-Agent string that is passed to
# every requests.get() call in this script.
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0'}
def get_movie_url(url):
    """Fetch one Douban Top 250 list page and scrape every movie linked on it.

    Each movie link found on the page is handed to ``get_movie_info``.
    A list page that cannot be downloaded is reported and skipped so the
    remaining pages can still be crawled.

    Args:
        url: URL of a Top 250 list page.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('Failed to fetch list page {}: {}'.format(url, exc))
        return
    selector = etree.HTML(response.text)
    # Every poster on the list page is wrapped in <div class="pic"><a href=...>.
    movie_hrefs = selector.xpath('//div[@class="pic"]/a/@href')
    for movie_href in movie_hrefs:
        get_movie_info(movie_href)

def get_movie_info(url):
    """Scrape one Douban movie detail page and store it in the ``movies`` table.

    Uses the module-level ``headers``, the row counter ``id`` (incremented
    once per stored movie) and the open ``cursor`` / ``conn`` SQLite objects.
    A page that cannot be downloaded, or that lacks one of the required
    fields, is reported and skipped; nothing is written for it.

    Args:
        url: URL of a movie detail page.
    """
    global id
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('Failed to fetch movie page {}: {}'.format(url, exc))
        return
    page = response.text
    selector = etree.HTML(page)
    try:
        # Movie title.
        name = selector.xpath('//div[@id="content"]/h1/span[1]/text()')[0]
        # Director(s); string(.) flattens the nested <a> nodes into plain text.
        director = selector.xpath('//div[@id="info"]/span[1]/span[2]')[0].xpath('string(.)')
        # Leading actors, flattened the same way.
        actor = selector.xpath('//div[@id="info"]/span[3]/span[2]')[0].xpath('string(.)')
        # Genres, joined with '/'.
        style = '/'.join(re.findall('<span property="v:genre">(.*?)</span>', page))
        # Country/region: the raw text between the label and the next <br/>;
        # strip the whitespace that surrounds it in the markup.
        country = re.findall('<span class="pl">制片国家/地区:</span>(.*?)<br/>', page, re.S)[0].strip()
        # Release dates, joined with '/'.
        release_time = '/'.join(
            re.findall(r'<span property="v:initialReleaseDate" content=".*?">(.*?)</span>', page, re.S))
        # Runtime: the capture runs up to <br/>, so it may still contain the
        # closing </span> tag, which is removed here.
        runtime = re.findall('<span property="v:runtime" content=".*?">(.*?)<br/>', page, re.S)[0]
        runtime = runtime.replace('</span>', '')
        # Douban rating.
        score = re.findall('<strong class="ll rating_num" property="v:average">(.*?)</strong>', page)[0]
    except IndexError:
        # One of the [0] lookups found nothing: the page layout differs from
        # what the selectors expect. Report it instead of dropping it silently.
        print('Skipped {}: an expected field was not found on the page'.format(url))
        return
    # Index of the database record.
    id += 1
    # str() turns the lxml string results into plain str before binding.
    movie = (id, str(name), str(director), str(actor), str(style), str(country),
             str(release_time), str(runtime), score)
    print(movie)
    cursor.execute(
        "insert into movies(id,name,director,actor,style,country,release_time,time,score) values(?,?,?,?,?,?,?,?,?)", movie)
    # Commit so the row is persisted immediately.
    conn.commit()

if __name__ == '__main__':
    # Row counter shared with get_movie_info through ``global id``.
    id=0
    dbPath='movie.sqlite'
    # Start from an empty database on every run.
    if os.path.exists(dbPath):
        os.remove(dbPath)
    # Create the SQLite database file and open a connection to it.
    conn=sqlite3.connect(dbPath)
    try:
        cursor=conn.cursor()
        # Create the movies table.
        cursor.execute("""CREATE TABLE movies
    (id INT NOT NULL,
    name CHAR(50) NOT NULL,
    director CHAR(50) NOT NULL,
    actor CHAR(50) NOT NULL,
    style CHAR(50) NOT NULL,
    country CHAR(50) NOT NULL,
    release_time CHAR(50) NOT NULL,
    time CHAR(50) NOT NULL,
    score REAL NOT NULL
    );""")
        conn.commit()
        print('创建数据库成功')
        # The Top 250 list is paginated 25 movies per page via ?start=N.
        urls=['https://movie.douban.com/top250?start={}&filter='.format(i) for i in range(0,250,25)]
        for url in urls:
            get_movie_url(url)
            # Pause between list pages.
            time.sleep(1)
    finally:
        # Close the connection even if the crawl aborts with an exception.
        conn.close()

结果：

爬虫笔记_10