Python学习笔记——20170829

本文介绍了一种使用Python中的HTMLParser模块抓取豆瓣电影信息的方法。通过定义特定的解析器类,可以有效地从网页中提取电影标题、评分、导演及演员等信息。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

HTMLParser

  • 方法注解
from html.parser import HTMLParser
#
class MyParser(HTMLParser):
    """
    Skeleton HTMLParser subclass listing the overridable handler hooks.

    Every hook except handle_startendtag is a no-op; the class exists to
    show which callbacks are available and what markup triggers each one.
    """

    def __init__(self):
        super().__init__()

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag such as <br/> by deferring to the base class."""
        super().handle_startendtag(tag, attrs)

    def handle_starttag(self, tag, attrs):
        """Handle an opening tag, e.g. <a>."""

    def handle_endtag(self, tag):
        """Handle a closing tag, e.g. </a>."""

    def handle_charref(self, name):
        """
        Handle a numeric character reference, e.g. &#62; or &#x3E;.

        NOTE: per the html.parser documentation this hook is not called
        when convert_charrefs is true, which is the constructor default.
        """

    def handle_data(self, data):
        """Handle text between tags, e.g. "baidu" in <a href="http://www.baidu.com">baidu</a>."""

    def handle_comment(self, data):
        """Handle a comment, e.g. <!-- text -->."""

    def handle_decl(self, decl):
        """Handle a declaration that starts with <!, e.g. <!DOCTYPE HTML>."""

    def handle_entityref(self, name):
        """
        Handle a named character reference, e.g. &nbsp;.

        NOTE: per the html.parser documentation this hook is not called
        when convert_charrefs is true, which is the constructor default.
        """

    def handle_pi(self, data):
        """Handle a processing instruction, e.g. <?instruction>."""

Douban电影内容爬取

import requests
from html.parser import HTMLParser
#
class MovieParser(HTMLParser):
    """
    Movie parser.

    Collects one dict per <li> start tag that carries a non-empty
    data-title attribute. Results accumulate in self.moives (the attribute
    name is kept as-is because callers read it).
    """

    def __init__(self):
        super().__init__()
        self.moives = []

    def handle_starttag(self, tag, attrs):
        """
        Record a movie when the tag is an <li> with a data-title attribute.

        :param tag: tag name
        :param attrs: list of (name, value) pairs for the tag
        """

        def first_value(name):
            # First matching attribute wins; None when the attribute is absent.
            return next((pair[1] for pair in attrs if pair[0] == name), None)

        if tag != 'li':
            return
        title = first_value('data-title')
        if not title:
            return

        score = first_value('data-score')
        self.moives.append({
            'title': title,
            # A missing score is stored as the string "None", not the None object.
            'score': "None" if score is None else score,
            'director': first_value('data-director'),
            'actors': first_value('data-actors'),
        })

    def error(self, message):
        """Ignore the reported message."""
        pass
#
#
def my_movies(url, timeout=10):
    """
    Download a page and extract the movies listed on it.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up;
                    without it requests.get may block indefinitely
    :return: list of movie dicts collected by MovieParser, or None when
             the request fails (a message is printed in that case)
    """

    headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no result"; anything
        # else (KeyboardInterrupt, programming errors) propagates.
        print('发生异常')
        return None

    mp = MovieParser()
    mp.feed(response.text)
    # close() flushes any buffered trailing markup before the list is returned.
    mp.close()
    return mp.moives
#
#
def save_file(path, text):
    """
    Write text to a file, replacing any existing content.

    :param path: destination file path
    :param text: string to write; encoded as UTF-8
    :return: None
    """

    with open(path, mode='w', encoding='UTF-8') as out:
        out.write(text)
#
#
if __name__ == '__main__':
    import json

    url = "https://movie.douban.com/cinema/nowplaying/chongqing/"
    movies = my_movies(url)
    # my_movies returns None on failure; skip saving rather than writing "None".
    if movies is not None:
        # json.dumps emits valid JSON (null for None, escaped quotes) and
        # ensure_ascii=False keeps Chinese text readable in the output file.
        save_file("d:/upload/movies.json", json.dumps(movies, ensure_ascii=False))
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值