豆瓣电影Top250
使用SQLite数据库、XPath和正则表达式
PS:XPath的string(.)函数可以取出嵌套节点内的全部文本内容
import requests
from lxml import etree
import re
import sqlite3
import os
import time
# Request headers sent with every page fetch; the User-Agent string
# identifies the client as Firefox 102 on 64-bit Windows 10.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) '
        'Gecko/20100101 Firefox/102.0'
    ),
}
# 抓取豆瓣电影top250页面
def get_movie_url(url, timeout=10):
    """Fetch one Top 250 listing page and scrape every movie linked from it.

    The page is downloaded with the module-level ``headers``; every
    ``href`` found under ``<div class="pic"><a>`` is passed to
    ``get_movie_info`` in document order.

    Args:
        url: Address of a listing page.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            whole crawl indefinitely.

    Raises:
        Exceptions raised by ``requests.get`` (including timeouts) are not
        caught here and propagate to the caller.
    """
    html = requests.get(url, headers=headers, timeout=timeout)
    selector = etree.HTML(html.text)
    # Collect the detail-page URL of every movie on the current page.
    movie_hrefs = selector.xpath('//div[@class="pic"]/a/@href')
    if not movie_hrefs:
        # Make an empty result visible instead of silently doing nothing.
        print('No movie links found on {} (HTTP {})'.format(url, html.status_code))
    for movie_href in movie_hrefs:
        get_movie_info(movie_href)
def get_movie_info(url, timeout=10):
    """Scrape one movie detail page and insert it into the ``movies`` table.

    The page is downloaded with the module-level ``headers``. Name,
    director and actors are read with XPath; genre, country, release date,
    runtime and score are read with regular expressions over the raw HTML.
    On success the module-level counter ``id`` is incremented, the row is
    printed, inserted through the module-level ``cursor`` and committed on
    the module-level ``conn``.

    If any required field is missing from the page (an ``IndexError`` while
    extracting), the movie is skipped: a message naming the URL is printed,
    ``id`` is left unchanged and nothing is written to the database.

    Args:
        url: Address of the movie detail page.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the crawl.

    Raises:
        Exceptions raised by ``requests.get`` (including timeouts) and by
        ``sqlite3`` are not caught here and propagate to the caller.
    """
    global id

    html = requests.get(url, headers=headers, timeout=timeout)
    page_text = html.text
    selector = etree.HTML(page_text)

    try:
        # Movie title.
        name = selector.xpath('//div[@id="content"]/h1/span[1]/text()')[0]
        # Director; string(.) flattens the text of all nested nodes.
        director = selector.xpath('//div[@id="info"]/span[1]/span[2]')[0].xpath('string(.)')
        # Leading actors, flattened the same way.
        actor = selector.xpath('//div[@id="info"]/span[3]/span[2]')[0].xpath('string(.)')
        # Genres, joined with '/'.
        styles = re.findall('<span property="v:genre">(.*?)</span>', page_text)
        style = '/'.join(styles).strip('/')
        # Producing country/region; the text captured between the label and
        # <br/> may carry surrounding whitespace, so trim it.
        country = re.findall(
            '<span class="pl">制片国家/地区:</span>(.*?)<br/>', page_text, re.S)[0].strip()
        # Release dates, joined with '/'.
        release_times = re.findall(
            r'<span property="v:initialReleaseDate" content=".*?">(.*?)</span>',
            page_text, re.S)
        release_time = '/'.join(release_times).strip('/')
        # Runtime; the capture runs up to <br/>, so it can still contain the
        # closing </span> tag, which is removed here.
        runtime = re.findall(
            '<span property="v:runtime" content=".*?">(.*?)<br/>', page_text, re.S)[0]
        runtime = runtime.replace('</span>', '')
        # Douban rating.
        score = re.findall(
            '<strong class="ll rating_num" property="v:average">(.*?)</strong>',
            page_text)[0]
    except IndexError:
        # A required field was not found on this page: skip it, but say so.
        print('Skipped (missing field on page): {}'.format(url))
        return

    # Index of the database record; only advanced for movies actually stored.
    id += 1
    # Row to store in the SQLite database.
    movie = (id, str(name), str(director), str(actor), style, country,
             release_time, runtime, score)
    print(movie)
    # Save the current movie and commit so the row is persisted immediately.
    cursor.execute(
        "insert into movies(id,name,director,actor,style,country,release_time,time,score) values(?,?,?,?,?,?,?,?,?)",
        movie)
    conn.commit()
if __name__ == '__main__':
    # Record counter shared with get_movie_info through ``global id``.
    id = 0
    dbPath = 'movie.sqlite'
    # Start every run from a fresh database file.
    if os.path.exists(dbPath):
        os.remove(dbPath)
    # Create the SQLite database.
    conn = sqlite3.connect(dbPath)
    try:
        # sqlite3.Cursor used by get_movie_info for the inserts.
        cursor = conn.cursor()
        # Create the movies table.
        cursor.execute("""CREATE TABLE movies
(id INT NOT NULL,
name CHAR(50) NOT NULL,
director CHAR(50) NOT NULL,
actor CHAR(50) NOT NULL,
style CHAR(50) NOT NULL,
country CHAR(50) NOT NULL,
release_time CHAR(50) NOT NULL,
time CHAR(50) NOT NULL,
score REAL NOT NULL
);""")
        # Commit so the movies table exists before any insert.
        conn.commit()
        print('创建数据库成功')
        # Ten listing pages: start=0, 25, ..., 225.
        urls = ['https://movie.douban.com/top250?start={}&filter='.format(str(i)) for i in range(0, 250, 25)]
        for url in urls:
            get_movie_url(url)
            # Pause between listing pages.
            time.sleep(1)
    finally:
        # Close the connection even if the crawl aborts with an exception.
        conn.close()
结果: