# coding=gbk
"""Simple crawler for dytt8.net ("Movie Heaven").

Fetches the site's home page, follows the "latest movie downloads" listing
links, extracts each movie's name, starring cast and download URL with
regular expressions, and dumps all records to a JSON file.
"""
import html
import json
import re
import ssl
from urllib.request import urlopen

# The site's TLS certificate chain does not validate; disable verification
# so urlopen() can reach it at all.
ssl._create_default_https_context = ssl._create_unverified_context

home = "https://www.dytt8.net"
lst = []  # accumulated movie records: {"movie name": ..., "starring": [...], "url": ...}


def getPage_url():
    """Return the absolute URLs of the listing pages linked from the home page.

    Scans the (GBK-encoded) home page for anchors that follow the
    "最新电影下载" label and prefixes each relative href with the site root.
    """
    response = urlopen(home).read().decode('gbk')
    son_url = re.finditer(r"最新电影下载</a>]<a href='(?P<url>.*?)'>", response)
    return [home + item.group("url") for item in son_url]


# Compiled once (loop-invariant): matches the movie name, the cast block and
# the download link inside a detail page's "Zoom" div.
_MOVIE_RE = re.compile(
    r'<div id="Zoom".*?◎片 名(?P<name>.*? )<br />.*?◎主 演(?P<arts>.*?)<br /><br />.*?<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<url>.*?)">',
    re.S)


def parsePage(url_lst):
    """Scrape every page in *url_lst*, appending one record per movie to
    the module-level ``lst``, then write the whole list to 爬虫文件.json.
    """
    for page_url in url_lst:
        content = urlopen(page_url).read().decode("gbk")
        for item in _MOVIE_RE.finditer(content):
            # NOTE(fix): a fresh dict per match — the original reused one
            # dict per page, so lst held duplicate references to it and
            # every entry showed the last movie's data.
            record = {}
            record["movie name"] = item.group("name").replace(u'\u3000', u' ')
            # Strip <br /> tags and fullwidth spaces, unescape HTML entities,
            # then split the cast block on the double-space separator.
            cast_raw = item.group("arts").replace("<br />", "").replace(u'\u3000', u' ')
            record["starring"] = html.unescape(cast_raw).split("  ")
            record["url"] = item.group("url").replace(u'\u3000', u' ')
            lst.append(record)
    # Explicit utf-8 so the non-ASCII JSON is written identically on every
    # platform (the default encoding is platform-dependent).
    with open("爬虫文件.json", "w", encoding="utf-8") as f:
        json.dump(lst, f, ensure_ascii=False)


if __name__ == "__main__":
    url_list = getPage_url()
    parsePage(url_list)
简单爬虫--电影天堂
最新推荐文章于 2024-10-18 19:24:23 发布