#!/usr/bin/python3
# -*- coding:utf-8 -*-
# author: https://blog.youkuaiyun.com/zhongqi2513
# ====================================================
# Description: scrape the Douban Top 250 movie list
# ====================================================
from urllib.request import urlopen
from bs4 import BeautifulSoup
movie_url = []       ## movie detail-page URLs
movie_name = []      ## movie titles
movie_other = []     ## alternate titles
playable_list = []   ## whether the movie is playable online
daoyan_list = []     ## directors
zhuyan_list = []     ## lead actors
year_list = []       ## release years
district_list = []   ## regions
type_list = []       ## genres
rating_list = []     ## ratings
count_list = []      ## review counts
quote_list = []      ## one-line taglines
## Scrape one page of data; everything is appended to the lists above
def get_one_page(url):
    response = urlopen(url)
    bs = BeautifulSoup(response, "html.parser")
    headdiv25 = bs.select("div.hd")
    for div in headdiv25:
        ## Scrape the movie title (Chinese and original titles are concatenated)
        titles = div.select("span.title")
        title = "".join([t.text for t in titles])
        movie_name.append(title.replace("\xa0", ""))
        ## Scrape the alternate titles
        others = div.select("span.other")
        other = "".join([o.text for o in others])
        movie_other.append(other.replace("\xa0", "").replace(" ", "").strip("/"))
        ## Scrape the "playable" flag
        plays = div.select("span.playable")
        play = "".join([p.text for p in plays])
        playable_list.append(play)
        ## Scrape the movie detail-page URL (avoid shadowing the url parameter)
        url_a = div.select("a")
        movie_url.append(url_a[0].get("href"))
    bodydiv25 = bs.select("div.info div.bd")
    for div in bodydiv25:
        ps = div.select("p")
        body = ps[0].text.strip().replace("\n", "")
        twocontent = body.split(" ")
        ## First half: parse the director and the lead actors.
        ## Splitting on ":" yields three parts when both the director and
        ## lead-actor labels are present.
        threecontent = twocontent[0].split(":")
        if len(threecontent) == 3:
            daoyan_list.append(threecontent[1].strip("主演 ").replace("\xa0", ""))
            zhuyan_list.append(threecontent[2].strip().replace("\xa0", ""))
        else:
            daoyan_list.append("~")
            zhuyan_list.append("~")
        ## Second half: parse the year, region, and genre
        if len(twocontent) <= 1:
            year_list.append("~")
            district_list.append("~")
            type_list.append("~")
        else:
            right_three_content = twocontent[1].strip().replace("\xa0", "").split("/")
            year_list.append(right_three_content[0])
            district_list.append(right_three_content[1])
            type_list.append(right_three_content[2])
        ## Scrape the movie's one-line tagline
        quote_span = div.select("p.quote span")
        if len(quote_span) != 0:
            quote_list.append(quote_span[0].text)
        else:
            quote_list.append("")
    stardiv25 = bs.select("div.star")
    for star in stardiv25:
        ## Scrape the rating
        rating_span25 = star.select("span.rating_num")
        rating_list.append(rating_span25[0].text)
        ## Scrape the review count: the fourth span reads like "123456人评价",
        ## so strip the trailing label characters
        count_span25 = star.select("span")
        count_list.append(count_span25[3].text.strip("人评价"))
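## Note: Douban may reject requests carrying urllib's default User-Agent
## (typically with HTTP 418). A minimal sketch of a header-carrying fetch;
## the helper name and UA string are assumptions, not part of the original.
## Swap it into get_one_page if plain urlopen(url) starts failing:
from urllib.request import Request
def fetch_with_headers(url):
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})  # assumed UA string
    return urlopen(req)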
## Scrape all ten pages of the Douban movie Top 250
def get_douban_top250_all():
    ## Scrape one page (25 movies) per iteration
    for x in range(1, 11):
        start = (x - 1) * 25
        url = "https://movie.douban.com/top250?start={0}&filter=".format(start)
        get_one_page(url)
        print("Page {0} scraped successfully".format(x))
## Kick off the crawl
get_douban_top250_all()
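## Optional courtesy (an assumption, not in the original script): a short
## time.sleep(1) inside the loop above, after each get_one_page(url) call,
## keeps the crawl gentle on the site.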
"""
建表语句:
CREATE TABLE `douban_movie250` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`url` varchar(255) DEFAULT NULL,
`moviename` varchar(255) DEFAULT NULL,
`nickname` varchar(255) DEFAULT NULL,
`playable` varchar(255) DEFAULT NULL,
`daoyan` varchar(255) DEFAULT NULL,
`zhuyan` varchar(255) DEFAULT NULL,
`year` varchar(255) DEFAULT NULL,
`district` varchar(255) DEFAULT NULL,
`type` varchar(255) DEFAULT NULL,
`rate` varchar(255) DEFAULT NULL,
`count` int(11) DEFAULT NULL,
`quote` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
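-- Note (assumption, not in the original): CHARSET=utf8mb4 would also cover
-- 4-byte characters such as emoji in taglines; utf8 matches the original setup.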
"""
print([len(movie_url), len(movie_name), len(movie_other), len(playable_list), len(daoyan_list), len(zhuyan_list),
len(year_list), len(district_list), len(type_list), len(rating_list), len(count_list), len(quote_list)])
import pymysql
conn = pymysql.connect(host="hadoop02", port=3306, user="root", password="root", database="bigdata", charset="utf8")
cursor = conn.cursor()
sql = "insert into douban_movie250 (`url`, `moviename`, `nickname`, `playable`, `daoyan`, `zhuyan`, `year`, `district`, `type`, `rate`, `count`, `quote`) values " \
      "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
## Insert the scraped records into MySQL
for index in range(len(movie_url)):
    record = [movie_url[index], movie_name[index], movie_other[index], playable_list[index], daoyan_list[index], zhuyan_list[index],
              year_list[index], district_list[index], type_list[index], rating_list[index], count_list[index], quote_list[index]]
    cursor.execute(sql, record)
## Commit once after all rows are queued, then release the connection
conn.commit()
cursor.close()
conn.close()
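## A bulk-insert alternative (sketch, assuming pymysql's cursor.executemany);
## functionally equivalent to the loop above, so it is left commented out to
## avoid inserting every row twice:
# records = list(zip(movie_url, movie_name, movie_other, playable_list,
#                    daoyan_list, zhuyan_list, year_list, district_list,
#                    type_list, rating_list, count_list, quote_list))
# cursor.executemany(sql, records)
# conn.commit()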