# 1. 爬取豆瓣电影 Top 250 数据,存为 Excel 文件
#获取豆瓣电影数据
import requests # 发送请求
from bs4 import BeautifulSoup # 解析网页
import pandas as pd # 存取csv
from time import sleep # 等待时间
# Module-level result lists, one per scraped field; every list starts empty.
movie_name: list = []  # movie title
movie_url: list = []  # link to the movie's page
movie_star: list = []  # rating score
movie_star_people: list = []  # number of people who rated
movie_director: list = []  # director
movie_actor: list = []  # leading actor(s)
movie_year: list = []  # release year
movie_country: list = []  # country
movie_type: list = []  # genre
movie_comment: list = []  # short comment
def get_movie_info(url, headers):
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.text, 'html.parser')
for movie in soup.select('.item'):
name = movie.select('.hd a')[0].text.replace('\n', '').split('/')[0] # 电影名称
movie_name.append(name)
url = movie.select('.hd a')[0]['href'] # 电影链接
movie_url.append(url)
star = movie.select('.rating_num')[0].text # 电影评分
movie_star.append(star)
star_people = movie.select('.star span')[3].text # 评分人数
star_people = star_people.strip().replace('人评价', '')
movie_star_people.append(star_people)
movie_infos = movie.select('.bd p')[0].text.strip() # 导演、主演、年份、国家、类型
director = movie_infos.split('\n')[0].split(' ')[0]
movie_director.append(director)
try: # 页面上既有导演,又有主演
actor = movie_infos.split('\n')[0].split(' ')[1]
movie_actor.append(actor)
except: # 页面上只有导演,没有主演
movie_actor.append(None)
if name == '大闹天宫 / 大闹天宫 上下集 / The Monkey King': # 大闹天宫,特殊处理
year0 = movie_infos.split('\n')[1].split('/')[0].strip()
year1 = movie_infos.split('\n')[1].split('/')[1].strip()
year2 = movie_infos.split('\n')[1].split('/')[2].strip()
year = year0 + '/' + year1 + '/' + year2
movie_year.append(year)
country = movie_infos.split('\n')[1].split('/')[3].strip()
movie_country.append(country)
type = movie_infos.split('\n')[1].split('/')[4].strip()
movie_type.append(type)
else: # 其他电影,正常处理
year = movie_infos.split('\n')[1].split('/')[0].strip()
movie_year.append(year)
country = movie_infos.split('\n')[1].split('/')[1].strip()
movie_country.append