简单爬虫--电影天堂

最新推荐文章于 2024-10-18 19:24:23 发布

不二郭

最新推荐文章于 2024-10-18 19:24:23 发布

阅读量328

点赞数

本文链接：https://blog.youkuaiyun.com/weixin_43957426/article/details/90693216

版权

该博客展示了一段Python代码实现电影信息爬虫的过程。代码通过导入相关库，先获取要爬取的网址，再使用正则表达式从网页内容中提取电影的片名、主演和下载链接等信息，最后将这些信息保存到JSON文件中。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

# coding=gbk
# ==================================准备工作==================================================
import ssl
import html

import json
import re
from urllib.request import urlopen
ssl._create_default_https_context = ssl._create_unverified_context
home = "https://www.dytt8.net"

lst = []


# ============================# 得到要爬的网址  ==============================================
def getPage_url():
    response = urlopen(home).read().decode('gbk')
    son_url = re.finditer(r"最新电影下载</a>]<a href='(?P<url>.*?)'>",response)
    url_list = []
    result_url_lst = []
    for item in son_url:
        url_list.append(item.group("url"))
    for url in url_list:
        result_url_lst.append(home+url)
    return result_url_lst


# ==========================================正则爬取===================================================
def parsePage(url_lst):
    for i in url_lst:
        dic = {}
        content = urlopen(i).read().decode("gbk")
        com = re.compile( r'<div id="Zoom".*?◎片　　名(?P<name>.*? )<br />.*?◎主　　演(?P<arts>.*?)<br /><br />.*?<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<url>.*?)">',
            re.S)
        ret = com.finditer(content)
        for item in ret:
            dic["movie name"] = item.group("name").replace(u'\u3000', u' ')
            str1 = item.group("arts").replace("<br />", "").replace(u'\u3000', u' ')
            str2 = html.unescape(str1)
            str1 = str2.split("       ")
            dic["starring"] = str1
            dic["url"] = item.group("url").replace(u'\u3000', u' ')
            lst.append(dic)
            with  open("爬虫文件.json", "w") as f:
                json.dump(lst, f, ensure_ascii=False)



url_list = getPage_url()
parsePage(url_list)