import requests
import re
'''
Scraper for a WeChat Official Account (公众号) article album.
Key points:
1. Use re.search to drop paragraphs that contain boilerplate keywords.
2. Instead of selenium, copy the album page's element from the browser dev
   tools and save it as a txt file to obtain the (temporary) article URL list.
'''
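# The saved element is expected (an assumption, inferred from the data-link
# regex below) to contain fragments like:
#   <li data-link="http://mp.weixin.qq.com/s?__biz=..." data-title="...">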
with open('majiaju.txt', 'r', encoding='utf-8') as file:
    txt_album = file.read()

# Album page the element was copied from:
# url_album = 'https://mp.weixin.qq.com/s?__biz=MzA5OTQ1MzEwNQ==&mid=2650783700&idx=1&sn=c3d2e97f5c7f270214200895214b7306&chksm=88890e0abffe871cc1a5600c587ec57f0661cbf4bc2f50b6643619395dc758fc6bd3c0f919e9&scene=178&cur_album_id=1304564191399002113#rd'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
# Direct download of the album page (abandoned; the txt-file approach above
# is used instead):
# r = requests.get(url=url_album, headers=headers)
# r.raise_for_status()
# r.encoding = r.apparent_encoding
# r = r.text
# print(r)
# Pull every article URL out of the copied element.
pat_html = r'data-link="(http:.*?)" data-title'
htmls = re.findall(pat_html, txt_album)
print(htmls)
print(len(htmls))
htmls.reverse()  # flip the order (presumably to write the oldest articles first)
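# Minimal sanity check (an addition, not in the original): stop early if the
# copied element contained no data-link attributes.
if not htmls:
    raise SystemExit('No article links found in majiaju.txt')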
# Pre-compiled patterns: the article title (<h2>) and the body paragraphs (<p>).
pat_h = re.compile(r'<h2.*?>(.*?)</h2>', re.S)
pat_p = re.compile(r'(<p.*?>.*?</p>)', re.S)
# Keywords that mark boilerplate paragraphs (tip jars, share prompts, ads).
pat_junk = re.compile(r'赞赏|转账|陪你坚持学习|公众号|“发现”|微信|发送|经方临床家|推荐|点击')

for i, html in enumerate(htmls[0:91], start=1):  # only the first 91 articles
    print('Downloading:', html)
    res = requests.get(url=html, headers=headers)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    page = res.text
    titles = pat_h.findall(page)
    if not titles:  # no <h2> on the page; skip instead of crashing on [0]
        print('No <h2> title found, skipping:', html)
        continue
    txt_h = titles[0]
    # print('h2 title:', txt_h)
    print('*' * 50)
    index = '<br>' + str(i).rjust(2, '0') + '.'
    title = '<h2>' + index + txt_h.strip() + '</h2>'
    with open('马家驹over.html', 'a+', encoding='utf-8') as f:
        f.write(title)
        print('Writing:', title)
        # Keep only paragraphs that contain none of the junk keywords.
        for txt in pat_p.findall(page):
            if pat_junk.search(txt) is None:
                f.write('\n')
                f.write(txt)
                f.write('\n')
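
# Optional sketch (an assumption, not in the original): pause between requests
# so as not to hammer the server. If wanted, add at the top of the download loop:
#
#   import time, random
#   time.sleep(random.uniform(1, 3))  # 1-3 s polite delay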