1. Scrape the Douban Books page at https://book.douban.com/; the code is as follows:
# coding:utf-8
import requests
from lxml import etree

# 1. Fetch the Douban Books page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
    "Referer": "https://www.douban.com/"
}
url = "https://book.douban.com/"
response = requests.get(url, headers=headers)
text = response.text
# Optionally save the raw page; response.content is bytes, so open in binary mode
# with open("book.html", "wb") as fp:
#     fp.write(response.content)

# 2. Extract the content from the HTML with XPath rules
html = etree.HTML(text)
ul = html.xpath("//ul[@class='list-col list-col5 list-express slide-item']")[0]
# print(etree.tostring(ul, encoding="utf-8").decode("utf-8"))
# Save the extracted <ul> as an HTML file
# with open("ul.html", "wb") as fp:
#     fp.write(etree.tostring(ul, encoding="utf-8"))
lis = ul.xpath(".//li")
# print(etree.tostring(lis[0], encoding="utf-8").decode("utf-8"))

# Loop over the <li> elements and extract the text and attributes
books = []
for li in lis:
    meta = li.xpath(".//div[@class='more-meta']")[0]
    # strip() removes leading/trailing whitespace;
    # /text() selects the text content of the element
    title = meta.xpath(".//h4[@class='title']/text()")[0].strip()
    author = meta.xpath(".//span[@class='author']/text()")[0].strip()
    year = meta.xpath(".//span[@class='year']/text()")[0].strip()
    publisher = meta.xpath(".//span[@class='publisher']/text()")[0].strip()
    abstract = meta.xpath(".//p[@class='abstract']/text()")[0].strip()
    book = {
        "title": title,
        "author": author,
        "year": year,
        "publisher": publisher,
        "abstract": abstract
    }
    books.append(book)

# 3. Save the scraped book info (printed here; see the saving sketch below)
print(books)
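
Step 3 above only prints the list. A minimal sketch of actually persisting books, assuming a local books.json output file is acceptable (the filename is illustrative, not from the original):

import json

# Hypothetical follow-up to the script above: dump the scraped list to JSON.
# ensure_ascii=False keeps the Chinese titles human-readable in the file.
with open("books.json", "w", encoding="utf-8") as fp:
    json.dump(books, fp, ensure_ascii=False, indent=2)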
2. Scrape the Douban Movies "now playing" page for Beijing at https://movie.douban.com/cinema/nowplaying/beijing/; the code is as follows:
# coding:utf-8
import requests
from lxml import etree

# 1. Fetch the target page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36",
    "Referer": "https://movie.douban.com/"
}
url = "https://movie.douban.com/cinema/nowplaying/beijing/"
response = requests.get(url, headers=headers)
text = response.text
# Note the types: response.content is bytes and response.text is str,
# so writing response.content requires a binary file mode
with open("responses.html", "wb") as fp:
    fp.write(response.content)

# 2. Extract the data from the fetched page with XPath rules
html = etree.HTML(text)
ul = html.xpath("//ul[@class='lists']")[0]
lis = ul.xpath("./li")
movies = []
for li in lis:
    # Each movie's metadata is stored in data-* attributes on the <li>
    title = li.get("data-title")
    score = li.get("data-score")
    duration = li.get("data-duration")
    region = li.get("data-region")
    thumbnail = li.xpath(".//img/@src")[0]
    movie = {
        "title": title,
        "score": score,
        "duration": duration,
        "region": region,
        "thumbnail": thumbnail
    }
    movies.append(movie)
print(movies)
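
Because the content/text distinction flagged above trips people up, here is a small illustrative sketch (not part of the original script) of what each attribute holds:

# Illustrative sketch: bytes vs. str in requests
resp = requests.get(url, headers=headers)
assert isinstance(resp.content, bytes)  # raw bytes -> open(..., "wb")
assert isinstance(resp.text, str)       # decoded text -> open(..., "w", encoding=...)
# requests guesses the decoding from the HTTP headers; override it if it guesses wrong:
resp.encoding = "utf-8"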
3. Scrape the 电影天堂 movie site, starting from http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html; the code is as follows:
# coding:utf-8
import requests
from lxml import etree

BASED_URL = "http://www.ygdy8.net"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
}

def get_detailed_urls(url):
    # 1. Fetch the list page
    response = requests.get(url, headers=HEADERS)
    # response.text is fine here; response.content is raw bytes and
    # would need explicit decoding (the site is GBK-encoded)
    text = response.text
    # 2. Collect the detail-page URLs from the movie table
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a//@href")
    detailed_urls = [BASED_URL + href for href in hrefs]
    return detailed_urls

def parse_detailed_page(url):
    movie = {}
    # Example detail page: http://www.ygdy8.net/html/gndy/dyzz/20180603/56925.html
    # 1. Fetch the detail page
    response = requests.get(url, headers=HEADERS)
    # The detail pages are GBK-encoded, so decode the raw bytes explicitly
    text = response.content.decode("gbk")
    # 2. Extract the fields we need
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie["title"] = title
    infor = html.xpath("//div[@id='Zoom']//span//text()")
    movie["other_name"] = infor[2].replace("◎片 名", "").strip()
    movie["year"] = infor[3].replace("◎年 代", "").strip()
    movie["country"] = infor[4].replace("◎产 地", "").strip()
    movie["typing"] = infor[5].replace("◎类 别", "").strip()
    return movie

def spider():
    movies = []
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    for i in range(1, 8):  # pages 1-7 of the listing
        url = base_url.format(i)
        detailed_urls = get_detailed_urls(url)
        for detailed_url in detailed_urls:
            movie = parse_detailed_page(detailed_url)
            movies.append(movie)
            for key in movie:
                print(movie[key])
            break  # stop after the first detail page while testing
        break  # stop after the first list page while testing

if __name__ == '__main__':
    spider()
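
The original code imported chardet but never used it. A minimal sketch of how it could replace the hard-coded "gbk" decode (decode_response is a hypothetical helper, and chardet must be installed):

import chardet

def decode_response(response):
    # chardet.detect returns a dict like {"encoding": "GB2312", "confidence": 0.99, ...};
    # fall back to gbk if detection fails
    guess = chardet.detect(response.content)
    return response.content.decode(guess["encoding"] or "gbk", errors="ignore")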