import csv  # CSV module for saving results
import re  # regular expressions for parsing
import random  # random choice from the User-Agent pool
from ua_info import ua_list  # locally built User-Agent pool
from urllib import request  # urllib's request module


# Build the spider class
class Dyttspider(object):
    # Define constants
    def __init__(self):
        # Entry URL: first page of the latest-movies list
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
        # self.url1 = 'https://www.dytt8.net/{}'  # template for second-level (detail) pages

    # Regex parsing function: extract movie titles from the list page
    def parse_html(self, html):
        bds = '<table width="100%".*?<td width="5%".*?class="ulink">(.*?)</a>.*?</table>'
        pattern = re.compile(bds, re.S)
        l_list = pattern.findall(html)
        for i in l_list:
            print('movie:%s \n' % i)
        # print(l_list)
        return l_list

    # Request function: fetch a page and return it as a Unicode string
    def get_html(self, url):
        headers = {'User-Agent': random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        # The site serves gb2312-encoded pages; ignore undecodable bytes
        html = res.read().decode('gb2312', 'ignore')
        print('OK')
        return html

    # Save function: write the scraped data to the target CSV file
    def save_file(self, l_list):
        with open('maoyan.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(l_list)

    # Main function: controls the overall crawl flow
    def run(self):
        url = self.url
        html1 = self.get_html(url)
        l_list = self.parse_html(html1)
        # self.save_file(l_list)


# Run as a script
if __name__ == "__main__":
    spider = Dyttspider()
    spider.run()
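
The line `from ua_info import ua_list` assumes a small local module, ua_info.py, that simply exposes a list of User-Agent strings for the spider to choose from at random. A minimal sketch of such a module is shown below; the specific strings are illustrative placeholders, not taken from the original project.

# ua_info.py -- minimal sketch of the assumed User-Agent pool module.
# The entries below are illustrative placeholders; any set of real
# browser User-Agent strings will work.
ua_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0',
]

With ua_info.py placed next to the spider script, running the script prints each matched movie title; uncommenting the `self.save_file(l_list)` call in `run()` also writes the titles to maoyan.csv.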