思路:
获取网页源码,解析获取内容,利用正则取出所需内容后写入文本文档


'''
Created on xxx
Zhongyong (The Doctrine of the Mean)
@author: xxx

Download the chapter pages of the book from shicimingju.com and save the
paragraph text of each page to its own UTF-8 text file.
'''
import os
import re

import requests

# Same patterns as before, compiled once instead of on every call.
_PARAGRAPH_RE = re.compile(r'<p>.*')
_TITLE_RE = re.compile(r'<h1>(.*)</h1>')
# Path separators and characters that are not allowed in Windows file names.
_UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10


def getContent(url, path):
    '''Fetch one chapter page and write its paragraphs to a text file.

    The file is named after the first ``<h1>...</h1>`` match in the page and
    is created inside ``path``; ``path`` is created first if it is missing.

    Args:
        url: Address of the chapter page to download.
        path: Directory in which ``<title>.txt`` is written.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an error status.
        IndexError: The page contains no ``<h1>`` heading to name the file.
        OSError: The directory or file could not be created or written.
    '''
    # Fetch the page source.
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of saving an error page as a chapter.
    response.raise_for_status()
    html = response.text
    content = _PARAGRAPH_RE.findall(html)

    # Extract the title.
    titles = _TITLE_RE.findall(html)
    if not titles:
        raise IndexError('no <h1> title found in %s' % url)
    # The title is remote content that ends up in a file path: replace
    # separators and illegal characters so the file stays inside ``path``.
    title = _UNSAFE_FILENAME_CHARS_RE.sub('_', titles[0])

    # Create the target directory (and any missing parents) when absent.
    if not os.path.exists(path):
        print('正创建目录,请稍候...')
        os.makedirs(path, exist_ok=True)
        print('目录创建完成')

    # os.path.join picks the right separator for the current platform.
    file_path = os.path.join(path, title + '.txt')

    # ``with`` closes the file even if a write fails part-way through.
    with open(file_path, 'w', encoding='utf8') as f:
        # Drop the leading paragraph tag from each match and write the text.
        f.writelines(p.replace('<p> ', '') for p in content)
    print('文档写入完成!')


def main():
    '''Download chapter pages 1 through 33 into the output directory.'''
    path = r'E:\eclipse\workspace\zhongyong'
    for i in range(1, 34):
        url = 'http://www.shicimingju.com/book/zhongyong/%s.html' % i
        getContent(url, path)


if __name__ == '__main__':
    main()