# Python 3.6
# Scrape poems and prose from gushiwen.org (古诗文网).
import os
import re
from urllib.parse import urljoin

import html5lib
import requests
from bs4 import BeautifulSoup
def content(soup):
    """Collect poem titles and their page links from a parsed category page.

    Walks the ``<a>`` tags of *soup*, skipping the first eight (presumably
    site navigation links -- verify against the live page), and stops at the
    first link whose href is "/gushi/tangshi.aspx" or "/gushi/xiaowen.aspx".
    Each visited link is also printed as ``index title url``.

    Args:
        soup: Parsed page exposing ``find_all('a')``; every returned anchor
            must provide ``get('href')`` and a ``text`` attribute.

    Returns:
        A dict mapping each title (with any parenthesised text removed) to
        the absolute URL of its page. Anchors without an href are kept with
        their falsy href value (e.g. ``None``) so the caller can skip them.
        A repeated title overwrites the earlier entry.
    """
    base_url = "https://so.gushiwen.org/"
    stop_hrefs = ("/gushi/tangshi.aspx", "/gushi/xiaowen.aspx")

    poetrydict = {}
    for index, anchor in enumerate(soup.find_all('a')[8:], start=1):
        href = anchor.get('href')
        if href in stop_hrefs:
            # Reached the "Three Hundred Tang Poems" / "primary-school
            # classical Chinese" links: nothing after them is collected.
            break
        # urljoin resolves root-relative hrefs without producing a double
        # slash and leaves already-absolute hrefs untouched.
        url = urljoin(base_url, href) if href else href
        # Keep only the title itself: drop brackets and whatever they enclose.
        title = re.sub(r"\(.*?\)", "", str(anchor.text))
        print(index, title, url)
        poetrydict[title] = url
    return poetrydict
def fulltext(pdict):
    """Download the text of every poem listed in *pdict*.

    Args:
        pdict: Mapping of poem title to page URL. A falsy URL (``None`` or
            an empty string) marks a title that has no page.

    Returns:
        A dict with the same titles as keys. The value is the poem text
        after whitespace removal, bracket removal and ``formattext``, or
        ``None`` for titles whose URL is falsy.

    Raises:
        IndexError: If a fetched page has fewer ``<div>`` elements than the
            hard-coded positions used below.
    """
    poetrydict = {}
    for title in pdict:
        url = pdict[title]
        if not url:
            poetrydict[title] = None
            continue
        # NOTE(review): no timeout is passed, so a stalled server blocks the
        # whole crawl; consider requests.get(url, timeout=...).
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html5lib')
        # The poem body is located purely by position: the 10th <div> of the
        # page, then the 5th <div> nested inside it. This depends on the
        # site's current layout.
        container = soup.find_all('div')[9]
        poetry = str(container.find_all('div')[4].text)
        # Remove every whitespace character (spaces, line breaks, ...).
        poetry = re.sub(r'\s+', '', poetry)
        # Drop bracketed annotations (e.g. variant characters) with brackets.
        poetry = re.sub(r"\(.*?\)", "", poetry)
        poetrydict[title] = formattext(poetry)
    return poetrydict
# Single-pass translation table: sentence-level punctuation gets a line break
# appended, quotation brackets and book-title marks are deleted.
_FORMAT_TABLE = str.maketrans({
    '。': '。\n',
    '!': '!\n',
    '?': '?\n',
    ':': ':\n',
    ';': ';\n',
    '「': None,
    '」': None,
    '《': None,
    '》': None,
})


def formattext(s):
    """Reformat poem text so that each clause sits on its own line.

    A line break is inserted after every '。', '!', '?', ':' and ';'; the
    characters '「', '」', '《' and '》' are removed; leading and trailing
    whitespace is stripped from the result.

    Args:
        s: Poem text as a single string.

    Returns:
        The reformatted string.
    """
    # A semicolon keeps its own character; it is not rewritten as a colon.
    return s.translate(_FORMAT_TABLE).strip()
def output(dict, text):
    """Write every poem to *text*, echo it to stdout, then close *text*.

    Each entry is written to *text* as a "####" heading line carrying the
    title, followed by the poem body; the same title and body are printed
    to standard output without the heading marker.

    Args:
        dict: Mapping of poem title to poem text (the value may be ``None``,
            which is printed as "None").
        text: Writable text file object. It is always closed before this
            function returns, including when writing fails.
    """
    try:
        for title in dict:
            body = dict[title]
            print("####", title, "\n", body, file=text)
            print(title, "\n", body)
    finally:
        # Close the handle even when a write or print raises midway.
        text.close()
if __name__ == '__main__':
    # Ask for the category page to crawl, e.g.
    # https://so.gushiwen.org/gushi/tangshi.aspx (the original note points
    # to the right-hand sidebar of the site for such links).
    strc = input("请输入(类别)链接:")
    # Base name of the Markdown file to create (no extension).
    sc = input("请输入文件名:")
    sw = requests.get(strc)
    soup = BeautifulSoup(sw.text, 'html5lib')
    # Output directory is hard-coded; the file is <directory><name>.md.
    s = 'D:\\pythonPROJECT\\名字分析\\' + sc + ".md"
    # output() closes the file itself on completion; the with-block also
    # releases the handle when the crawl raises before output() is reached.
    with open(s, "w", encoding='utf-8') as file:
        output(fulltext(content(soup)), file)
# Tags: sub, get, text, poetrydict, web scraping, re, dict, classical Chinese prose, classical Chinese poetry
# Source: https://www.cnblogs.com/loeFairy/p/12244110.html