1. Scrape the Douban Books page at https://book.douban.com/; the code is as follows:
# coding:utf-8
import requests
from lxml import etree

# 1. Fetch the Douban Books page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
    "Referer": "https://www.douban.com/"
}
url = "https://book.douban.com/"
response = requests.get(url, headers=headers)
text = response.text
# Optionally save the raw page; response.content is bytes, so open in binary mode
# with open("book.html", "wb") as fp:
#     fp.write(response.content)

# 2. Extract the content from the HTML with XPath rules
html = etree.HTML(text)
ul = html.xpath("//ul[@class='list-col list-col5 list-express slide-item']")[0]
# print(etree.tostring(ul, encoding="utf-8").decode("utf-8"))
# Save the extracted <ul> as an HTML file
# with open("ul.html", "wb") as fp:
#     fp.write(etree.tostring(ul, encoding="utf-8"))
lis = ul.xpath(".//li")
# print(etree.tostring(lis[0], encoding="utf-8").decode("utf-8"))

# Loop over the <li> elements and extract the text and attributes
books = []
for li in lis:
    meta = li.xpath(".//div[@class='more-meta']")[0]
    # strip() removes leading/trailing whitespace;
    # /text() selects the text content of the element
    title = meta.xpath(".//h4[@class='title']/text()")[0].strip()
    author = meta.xpath(".//span[@class='author']/text()")[0].strip()
    year = meta.xpath(".//span[@class='year']/text()")[0].strip()
    publisher = meta.xpath(".//span[@class='publisher']/text()")[0].strip()
    abstract = meta.xpath(".//p[@class='abstract']/text()")[0].strip()
    book = {
        "title": title,
        "author": author,
        "year": year,
        "publisher": publisher,
        "abstract": abstract
    }
    books.append(book)

# 3. Save the scraped book info (printed here; see the saving sketch below)
print(books)
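
Step 3 above only prints the list. A minimal sketch of actually persisting books, assuming a local books.json output file is acceptable (the filename is illustrative, not from the original):

import json

# Hypothetical follow-up to the script above: dump the scraped list to JSON.
# ensure_ascii=False keeps the Chinese titles human-readable in the file.
with open("books.json", "w", encoding="utf-8") as fp:
    json.dump(books, fp, ensure_ascii=False, indent=2)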
2. Scrape the Douban Movies "now playing" page for Beijing at https://movie.douban.com/cinema/nowplaying/beijing/; the code is as follows:
# coding:utf-8
import requests
from lxml import etree

# 1. Fetch the target page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36",
    "Referer": "https://movie.douban.com/"
}
url = "https://movie.douban.com/cinema/nowplaying/beijing/"
response = requests.get(url, headers=headers)
text = response.text
# Note the types: response.content is bytes and response.text is str,
# so writing response.content requires a binary file mode
with open("responses.html", "wb") as fp:
    fp.write(response.content)

# 2. Extract the data from the fetched page with XPath rules
html = etree.HTML(text)
ul = html.xpath("//ul[@class='lists']")[0]
lis = ul.xpath("./li")
movies = []
for li in lis:
    # Each movie's metadata is stored in data-* attributes on the <li>
    title = li.get("data-title")
    score = li.get("data-score")
    duration = li.get("data-duration")
    region = li.get("data-region")
    thumbnail = li.xpath(".//img/@src")[0]
    movie = {
        "title": title,
        "score": score,
        "duration": duration,
        "region": region,
        "thumbnail": thumbnail
    }
    movies.append(movie)
print(movies)
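
Because the content/text distinction flagged above trips people up, here is a small illustrative sketch (not part of the original script) of what each attribute holds:

# Illustrative sketch: bytes vs. str in requests
resp = requests.get(url, headers=headers)
assert isinstance(resp.content, bytes)  # raw bytes -> open(..., "wb")
assert isinstance(resp.text, str)       # decoded text -> open(..., "w", encoding=...)
# requests guesses the decoding from the HTTP headers; override it if it guesses wrong:
resp.encoding = "utf-8"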
3. Scrape the 电影天堂 movie site, starting from http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html; the code is as follows:
# coding:utf-8
import requests
from lxml import etree

BASED_URL = "http://www.ygdy8.net"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
}

def get_detailed_urls(url):
    # 1. Fetch the list page
    response = requests.get(url, headers=HEADERS)
    # response.text is fine here; response.content is raw bytes and
    # would need explicit decoding (the site is GBK-encoded)
    text = response.text
    # 2. Collect the detail-page URLs from the movie table
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a//@href")
    detailed_urls = [BASED_URL + href for href in hrefs]
    return detailed_urls

def parse_detailed_page(url):
    movie = {}
    # Example detail page: http://www.ygdy8.net/html/gndy/dyzz/20180603/56925.html
    # 1. Fetch the detail page
    response = requests.get(url, headers=HEADERS)
    # The detail pages are GBK-encoded, so decode the raw bytes explicitly
    text = response.content.decode("gbk")
    # 2. Extract the fields we need
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie["title"] = title
    infor = html.xpath("//div[@id='Zoom']//span//text()")
    movie["other_name"] = infor[2].replace("◎片 名", "").strip()
    movie["year"] = infor[3].replace("◎年 代", "").strip()
    movie["country"] = infor[4].replace("◎产 地", "").strip()
    movie["typing"] = infor[5].replace("◎类 别", "").strip()
    return movie

def spider():
    movies = []
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    for i in range(1, 8):  # pages 1-7 of the listing
        url = base_url.format(i)
        detailed_urls = get_detailed_urls(url)
        for detailed_url in detailed_urls:
            movie = parse_detailed_page(detailed_url)
            movies.append(movie)
            for key in movie:
                print(movie[key])
            break  # stop after the first detail page while testing
        break  # stop after the first list page while testing

if __name__ == '__main__':
    spider()
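
The original code imported chardet but never used it. A minimal sketch of how it could replace the hard-coded "gbk" decode (decode_response is a hypothetical helper, and chardet must be installed):

import chardet

def decode_response(response):
    # chardet.detect returns a dict like {"encoding": "GB2312", "confidence": 0.99, ...};
    # fall back to gbk if detection fails
    guess = chardet.detect(response.content)
    return response.content.decode(guess["encoding"] or "gbk", errors="ignore")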