import requests
import re
'''
Scraper for a WeChat Official Account (公众号) article album.
Key points:
1. Use re.search to drop paragraphs that contain boilerplate keywords.
2. Instead of selenium, copy the album page's element from the browser dev
   tools and save it as a txt file to obtain the (temporary) article URL list.
'''
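# The saved element is expected (an assumption, inferred from the data-link
# regex below) to contain fragments like:
#   <li data-link="http://mp.weixin.qq.com/s?__biz=..." data-title="...">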
with open('majiaju.txt', 'r', encoding='utf-8') as file:
    txt_album = file.read()

# Album page the element was copied from:
# url_album = 'https://mp.weixin.qq.com/s?__biz=MzA5OTQ1MzEwNQ==&mid=2650783700&idx=1&sn=c3d2e97f5c7f270214200895214b7306&chksm=88890e0abffe871cc1a5600c587ec57f0661cbf4bc2f50b6643619395dc758fc6bd3c0f919e9&scene=178&cur_album_id=1304564191399002113#rd'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
# Direct download of the album page (abandoned; the txt-file approach above
# is used instead):
# r = requests.get(url=url_album, headers=headers)
# r.raise_for_status()
# r.encoding = r.apparent_encoding
# r = r.text
# print(r)
# Pull every article URL out of the copied element.
pat_html = r'data-link="(http:.*?)" data-title'
htmls = re.findall(pat_html, txt_album)
print(htmls)
print(len(htmls))
htmls.reverse()  # flip the order (presumably to write the oldest articles first)
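# Minimal sanity check (an addition, not in the original): stop early if the
# copied element contained no data-link attributes.
if not htmls:
    raise SystemExit('No article links found in majiaju.txt')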
# Pre-compiled patterns: the article title (<h2>) and the body paragraphs (<p>).
pat_h = re.compile(r'<h2.*?>(.*?)</h2>', re.S)
pat_p = re.compile(r'(<p.*?>.*?</p>)', re.S)
# Keywords that mark boilerplate paragraphs (tip jars, share prompts, ads).
pat_junk = re.compile(r'赞赏|转账|陪你坚持学习|公众号|“发现”|微信|发送|经方临床家|推荐|点击')

for i, html in enumerate(htmls[0:91], start=1):  # only the first 91 articles
    print('Downloading:', html)
    res = requests.get(url=html, headers=headers)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    page = res.text
    titles = pat_h.findall(page)
    if not titles:  # no <h2> on the page; skip instead of crashing on [0]
        print('No <h2> title found, skipping:', html)
        continue
    txt_h = titles[0]
    # print('h2 title:', txt_h)
    print('*' * 50)
    index = '<br>' + str(i).rjust(2, '0') + '.'
    title = '<h2>' + index + txt_h.strip() + '</h2>'
    with open('马家驹over.html', 'a+', encoding='utf-8') as f:
        f.write(title)
        print('Writing:', title)
        # Keep only paragraphs that contain none of the junk keywords.
        for txt in pat_p.findall(page):
            if pat_junk.search(txt) is None:
                f.write('\n')
                f.write(txt)
                f.write('\n')
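
# Optional sketch (an assumption, not in the original): pause between requests
# so as not to hammer the server. If wanted, add at the top of the download loop:
#
#   import time, random
#   time.sleep(random.uniform(1, 3))  # 1-3 s polite delay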