三年前随便写的草稿忘记发布了,今天重新水一下,不知道还好用不,改天试跑一下。
大概就是通过etree的方式采集页面的文章列表,然后遍历一下存储成markdown文件。
PS:之前发的很多文章在各种公众号和CSDN博客中被搬运,其实也是爬取我们的文章之后搬运,由于跨平台,申诉起来也很麻烦,后来索性更新就少了,哎。
2023大环境不好,大家都要努力加油鸭~~
import os
import re
import time
import urllib.request

import requests
from lxml import etree
import html2text as ht
# --- CSDN blog -> Markdown crawler (flat script) ---------------------------
# Walks the paginated article list of one blog, downloads every article page,
# converts the article body to Markdown with html2text, and writes one .md
# file per article into ./markdown/.
headers = ('User-Agent',
           'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
           '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]  # spoof a browser UA so the site serves full pages

# NOTE(review): the original "blog.youkuaiyun.com" is the well-known scraper
# substitution of the real host; restored to blog.csdn.net so the URL resolves.
url = 'https://blog.csdn.net/wsdc0521/article/list/'

text_maker = ht.HTML2Text()
text_maker.bypass_tables = False  # keep HTML tables as Markdown tables

out_dir = 'markdown'
# The original assumed this directory existed and hit OSError on first run.
os.makedirs(out_dir, exist_ok=True)


def _safe_name(name):
    """Replace characters illegal in Windows/Unix filenames with '_'."""
    return re.sub(r'[\\/:*?"<>|\r\n]+', '_', name).strip() or 'untitled'


for page in range(1, 10):
    print(f'-------------第{page}页开始-------')
    data = opener.open(url + str(page)).read().decode('utf-8', 'ignore')
    selector = etree.HTML(data)
    url_list = selector.xpath(
        '//div[@class="article-item-box csdn-tracking-statistics"]/h4/a/@href')
    if not url_list:  # past the last page -> stop paginating
        break
    for article_url in url_list:
        print(article_url)
        page_html = opener.open(article_url).read().decode('utf-8', 'ignore')
        tree = etree.HTML(page_html)
        body_nodes = tree.xpath('//*[@id="article_content"]')
        title_nodes = tree.xpath('//h1[@class="title-article"]/text()')
        if not body_nodes or not title_nodes:
            # layout changed or this is not a normal article page; the original
            # crashed with IndexError here — skip instead.
            print(f'跳过(页面结构不符):{article_url}')
            continue
        title = title_nodes[0]
        # Serialize the element subtree back to an HTML string (bytes -> str),
        # then convert that HTML fragment to Markdown.
        content = etree.tostring(body_nodes[0], encoding='utf-8').decode('utf-8')
        text = text_maker.handle(content)
        # Sanitize the title so open() cannot fail on characters like : ? / —
        # this replaces the original's duplicated OSError fallback path.
        file_path = os.path.join(out_dir, _safe_name(title) + '.md')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write('# ' + title + '\n')
            f.write(text)
        print(f'文章已爬取并保存markdown:{title}')
        time.sleep(1)  # be polite to the server ('time' was imported but unused)
    print(f'-------------第{page}页结束-------')