# -*- coding: UTF-8 -*-
import io
import os
import re
import ssl
import sys
import urllib2
# Table-of-contents page of the book. Chapter links on that page are bare
# file names ("123.html"), so a chapter URL is this string plus the href.
BOOK_URL = "https://www.i7wx.com/book/0/636/"

# Directory the per-chapter .txt files are written into.
OUTPUT_DIR = "./novel/"

# Encoding used to decode every fetched page.
PAGE_ENCODING = 'gbk'

# Proxy mapping handed to urllib2.ProxyHandler ('xxx' is a placeholder that
# must be replaced with a real proxy address before running).
PROXY = {
    'http': 'xxx',
    'https': 'xxx'
}

# All patterns are compiled once here rather than once per chapter.
# Chapter link: requires at least one digit and a literal ".html".
CHAPTER_LINK_RE = re.compile(r'<a href="(\d+\.html)">(.*?)</a>', re.I)
# re.S lets "." cross newlines so a content div spanning several lines matches.
CONTENT_RE = re.compile(r'<div id="content">(.*?)</div>', re.S)
PARAGRAPH_BREAK_RE = re.compile(r'(<br/><br/>)')
# NOTE(review): the pattern is a single space as found in the source; it may
# originally have been the "&nbsp;" entity -- confirm against a live page.
SPACE_RE = re.compile(r' ')
# Path separators and characters that are rejected in file names on common
# file systems.
UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|]')


def parse_chapter_links(index_html):
    """Return a list of ``(href, title)`` pairs found in *index_html*.

    Only anchors whose href is ``<digits>.html`` are returned, in document
    order.
    """
    return CHAPTER_LINK_RE.findall(index_html)


def extract_chapter_text(chapter_html):
    """Return the text inside ``<div id="content">``, or None if absent.

    Each ``<br/><br/>`` pair becomes a newline and every space character is
    removed.
    """
    match = CONTENT_RE.search(chapter_html)
    if match is None:
        return None
    text = PARAGRAPH_BREAK_RE.sub('\n', match.group(1))
    return SPACE_RE.sub('', text)


def safe_filename(title):
    """Return *title* with characters that are unsafe in file names replaced.

    Unsafe characters become ``_``; surrounding whitespace is stripped. An
    empty result falls back to ``'untitled'`` so a file name always exists.
    """
    cleaned = UNSAFE_FILENAME_RE.sub('_', title).strip()
    return cleaned or 'untitled'


def fetch_text(opener, page_url):
    """Open *page_url* with *opener* and return the body decoded as text.

    The response is always closed, even when reading or decoding fails.
    """
    response = opener.open(page_url)
    try:
        return response.read().decode(PAGE_ENCODING)
    finally:
        response.close()


def main():
    """Download every chapter linked from BOOK_URL into OUTPUT_DIR."""
    # NOTE(security): this disables TLS certificate verification through a
    # private ssl helper, so the connection is open to interception. Kept
    # because the original script relies on it; prefer
    # ssl.create_default_context() whenever the site's certificate validates.
    ssl_context = ssl._create_unverified_context()
    https_handler = urllib2.HTTPSHandler(context=ssl_context)
    proxy_handler = urllib2.ProxyHandler(PROXY)
    opener = urllib2.build_opener(proxy_handler, https_handler)

    # The original assumed the output directory already existed.
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    index_html = fetch_text(opener, BOOK_URL)
    for href, title in parse_chapter_links(index_html):
        text = extract_chapter_text(fetch_text(opener, BOOK_URL + href))
        if text is None:
            # Fetch and parse happen before the file is opened, so a page
            # without a content div no longer leaves an empty .txt behind.
            continue
        print(text)
        path = os.path.join(OUTPUT_DIR, safe_filename(title) + ".txt")
        # io.open encodes at the I/O boundary; the with-block closes the file.
        with io.open(path, 'w', encoding='utf-8') as chapter_file:
            chapter_file.write(text)


if __name__ == "__main__":
    main()
urllib2爬取小说三寸人间
最新推荐文章于 2023-12-15 11:58:44 发布
本文介绍了一种使用Python和正则表达式从特定网站抓取网络小说章节标题和内容的方法。通过设置代理和SSL上下文,实现了对HTTPS网站的访问(注意:代码使用 ssl._create_unverified_context() 关闭了证书校验,因此这种访问方式并不安全)。代码详细展示了如何解析网页,提取小说章节链接及标题,并进一步抓取各章节内容,最后将内容保存为TXT文件。
1211

被折叠的 条评论
为什么被折叠?



