注意编码问题:读取 HTML 文件、用 GET 请求抓取网页、写入文件时,都要指定正确的编码。
使用 Python 3 + BeautifulSoup + requests。
最后生成 XML 文件,供 Android 应用使用。
from bs4 import BeautifulSoup
import re
import requests
# Template for one <chapter> element, filled in with str.format():
#   {0}       - positional field: the chapter index
#   {title}   - chapter title
#   {content} - text placed inside <content>
#   {note}    - text placed inside <note>
#   {comment} - text placed inside <comment>
# NOTE: this module-level name is the same as the standard-library ``xml``
# package name, so a plain ``import xml`` in this module would be shadowed.
xml = '''
<chapter>
<idx>{0}</idx>
<title>{title}</title>
<content>
{content}
</content>
<note>
{note}
</note>
<comment>
{comment}
</comment>
</chapter>
'''
def parse(idx_chanpter, html):
    """Convert one chapter's HTML page into a ``<chapter>`` XML fragment.

    Every ``<p class="STYLE4">`` paragraph is read in document order. A
    paragraph whose stripped text is a section marker switches the section
    that following paragraphs are appended to:

    * chapter heading (``第…章``) or ``[原文]`` -> ``<content>``
    * ``[译文]``                                -> ``<note>``
    * ``[注释]``                                -> ``<comment>``
    * ``[引语]`` or ``[评论]``                  -> stop reading the page

    Any other paragraph is appended (followed by a newline) to the current
    section. Paragraphs seen before any marker go to ``<content>``.

    Args:
        idx_chanpter: Chapter number (int); used for ``<idx>`` and to build
            the title ``第N章``.
        html: The page's HTML as an already-decoded string.

    Returns:
        The module-level ``xml`` template filled in for this chapter.
    """
    # Local import: the module-level ``xml`` template string shares its name
    # with the standard-library package, so import the helper directly here.
    from xml.sax.saxutils import escape

    soup = BeautifulSoup(html, "lxml")
    paragraphs = soup.find_all('p', class_='STYLE4')

    # One list of text pieces per output section: content, note, comment.
    sections = [[], [], []]
    section_of_marker = {'[原文]': 0, '[译文]': 1, '[注释]': 2}
    stop_markers = ('[引语]', '[评论]')
    current = 0

    for paragraph in paragraphs:
        txt = paragraph.text.strip()
        # The first paragraph looks like
        # <p align="left" class="STYLE4">第九章 <BR> [原文] </p>;
        # treat it as the [原文] marker. The match is anchored at the start
        # and requires a numeral between 第 and 章, so ordinary text that
        # merely contains the character 章 is no longer swallowed.
        if re.match(r'第[〇零一二三四五六七八九十百千\d]+章', txt):
            txt = '[原文]'

        if txt in stop_markers:
            break
        if txt in section_of_marker:
            current = section_of_marker[txt]
        else:
            # Escape &, < and > so the generated file stays well-formed XML.
            sections[current].append(escape(txt) + '\n')

    content, note, comment = (''.join(pieces) for pieces in sections)
    return xml.format(
        idx_chanpter,
        title='第%d章' % idx_chanpter,
        content=content,
        note=note,
        comment=comment,
    )
if __name__ == '__main__':
    # Script entry point: download each chapter page, convert it with
    # parse(), and write all chapters into a single UTF-8 XML file.
    print('开始...\n')
    # Text mode with an explicit encoding replaces the per-write .encode()
    # calls; newline='\n' writes line endings exactly as given on every
    # platform, matching the previous binary-mode output.
    with open('道德经.xml', 'w', encoding='utf-8', newline='\n') as file:
        file.write('<?xml version="1.0" encoding="utf-8"?>\n<article>\n<title>道德经</title>\n')
        # NOTE(review): only chapters 1-2 are fetched here; widen the range
        # to cover the whole book once the output looks right.
        for i in range(1, 3):
            url = 'http://www.daodejing.???/%d.html' % i
            # A timeout keeps a stalled connection from hanging the script.
            r = requests.get(url, timeout=30)
            # Fail loudly on HTTP errors instead of parsing an error page
            # and writing it out as a chapter.
            r.raise_for_status()
            # The pages are decoded as GB2312 rather than the encoding
            # requests guesses from the headers.
            # NOTE(review): if rare characters come out garbled, the site
            # may really be GBK/GB18030 - confirm against the page.
            r.encoding = 'gb2312'
            file.write(parse(i, r.text))
        file.write('</article>')
    print('\n...结束')
《道德经》共81章,手动复制的话,5个小时内也能做完,可是实在太无聊了。
学会爬虫之后,再抓取其他文章时会更省事。