# 使用 Python 爬取豆瓣电影 TOP250
# 爬取豆瓣电影TOP250
# -*- coding: utf-8 -*-
# @Author: lovenLiu
# @Date: 2019-04-03 09:03:09
# @Last Modified by: lovenLiu
# @Last Modified time: 2019-04-03 10:59:39
import os
import re
import time
import json
import requests
from bs4 import BeautifulSoup
# 爬取分页数据
def douban_page(page_url):
response = requests.get(page_url)
if response.status_code == 200:
soup = BeautifulSoup(response.text, "html.parser")
grid_view = soup.find(class_="grid_view")
grid_view_items = grid_view.find_all("li")
page_list = []
for item in grid_view_items:
# 大部分电影标题栏有两个title标签和一个other标签
# 但部分电影只有一个title标签,比如top2《霸王别姬》
titles = item.find_all(class_="title")
name = titles[0].get_text().strip() # 电影名称
if len(titles) >= 2:
alias = titles[1].get_text().strip(" / ") # 电影别名
else:
alias = ""
# 提取电影年份、地区、类型等信息
p_elem = item.find(class_="bd").find("p", class_="")
p_strs = re.findall(r'<