# coding: utf-8
import bs4
from bs4 import BeautifulSoup
import urllib2
import codecs
import time
import json
import sys
def novelFilter(content):
    """Return *content* with the chapter markup and spaces removed.

    The following literals are deleted, in this order: a doubled
    ``<br />`` separated by a newline, a single ``<br />``, the opening
    ``<dd id="contents">`` tag, the closing ``</dd>`` tag, and finally
    every space character.

    Args:
        content: Chapter text, possibly still wrapped in the markup above.

    Returns:
        The text with all of the literals above removed.
    """
    # Tags are stripped before spaces: '<dd id="contents">' contains a space
    # itself, so removing spaces first would turn it into '<ddid="contents">'
    # and the wrapper tag would never match.
    # NOTE(review): the last literal may originally have been a non-breaking
    # space (or '&nbsp;') that was flattened to a plain space -- confirm
    # against real page output.
    for token in ('<br />\n<br />', '<br />', '<dd id="contents">', '</dd>', ' '):
        content = content.replace(token, '')
    return content
def novelFetch(url, title):
    """Download one chapter page and return it as filtered text.

    The page is decoded as GBK (undecodable bytes are dropped), the
    ``<dd id="contents">`` element is located, and the result is passed
    through ``novelFilter`` together with the title heading. The title is
    also echoed to stdout.

    Args:
        url: Address of the chapter page.
        title: Chapter title as a UTF-8 byte string.

    Returns:
        A newline, the title, a newline, then the chapter body, all run
        through ``novelFilter``. The body is empty when the page has no
        ``<dd id="contents">`` element.
    """
    novel = urllib2.urlopen(url)
    try:
        page = novel.read().decode('gbk', 'ignore')
    finally:
        # Release the connection even when reading or decoding fails.
        novel.close()

    soup = BeautifulSoup(page)
    body = soup.find('dd', id='contents')
    # find() yields None when the element is absent; str(None) would put the
    # literal text "None" into the saved chapter.
    text = '' if body is None else str(body)

    contents = novelFilter('\n' + title + '\n' + text)
    print(title.decode("utf-8"))
    return contents
def novelSelect(url, mark):
    """Fetch every chapter listed after the bookmarked one on an index page.

    The index page is decoded as GBK and each ``<td>`` cell is inspected in
    document order. Once a link whose text equals *mark* has been seen,
    every later link is downloaded with ``novelFetch``.

    Args:
        url: Address of the index page; each link's ``href`` is appended
            to it to form the chapter address.
        mark: Title of the last chapter already downloaded, compared
            against the UTF-8 decoded link text.

    Returns:
        A dict with ``'contents'`` (the concatenated text of the fetched
        chapters, empty when nothing was fetched) and ``'bookmark'`` (the
        title of the last link that was read successfully).
    """
    link = urllib2.urlopen(url)
    try:
        page = link.read().decode('gbk', 'ignore')
    finally:
        # Release the connection even when reading or decoding fails.
        link.close()
    soup = BeautifulSoup(page)

    chapters = []
    found = False
    title = ''
    for cell in soup.findAll('td'):
        try:
            href = url + cell.a['href']
            title = str(cell.a.string)
            if found and href:
                chapters.append(novelFetch(href, title))
            if title.decode("utf-8") == mark:
                found = True
        except Exception:
            # Best effort: cells without a usable link, and chapters that
            # fail to download, are skipped. Catching Exception rather than
            # everything lets Ctrl-C and SystemExit stop the run.
            # NOTE(review): a chapter whose download fails is still passed
            # over by the returned bookmark -- confirm this is acceptable.
            continue

    return {
        'contents': ''.join(chapters),
        'bookmark': title,
    }
def novelManage(info=0):
    """Load or save the bookmark table kept in ``novel.json``.

    Args:
        info: When truthy, the table to write to ``novel.json``. When
            falsy (the default), the table is read from the file instead.

    Returns:
        The table that was written, or the table parsed from the file.
    """
    if info:
        # Serialise before opening the file: opening with 'w' truncates it,
        # so a failure in json.dumps must not be able to wipe the bookmarks.
        payload = json.dumps(info, indent=2, ensure_ascii=False)
        if isinstance(payload, bytes):
            # The UTF-8 writer below expects text, not an encoded byte string.
            payload = payload.decode('utf-8')
        with codecs.open('novel.json', 'w', 'utf-8') as f:
            f.write(payload)
    else:
        with codecs.open('novel.json', 'r', 'utf-8') as f:
            info = json.loads(f.read())
    return info
if __name__ == '__main__':
    # For every novel in novel.json: fetch the chapters published after the
    # stored bookmark, save them to a timestamped text file, and advance the
    # bookmark. novel.json is rewritten only when something was downloaded.
    novels = novelManage()
    hasUpdate = False
    for title in novels:
        novel = novelSelect(novels[title]['url'], novels[title]['bookmark'])
        cont = novel['contents']
        bookmark = novel['bookmark']
        if not (cont and bookmark):
            continue

        # novelSelect builds these from UTF-8 byte strings; decode them
        # explicitly so the UTF-8 file writer and the JSON table receive
        # text rather than relying on an implicit ASCII conversion.
        if isinstance(cont, bytes):
            cont = cont.decode('utf-8')
        if isinstance(bookmark, bytes):
            bookmark = bookmark.decode('utf-8')

        novels[title]['bookmark'] = bookmark
        timestamp = time.strftime("%Y%m%d%H%M", time.localtime())
        # The chapter file is written as UTF-8.
        with codecs.open(title + '_' + timestamp + '.txt', 'w', 'utf-8') as f:
            f.write(cont)
        hasUpdate = True

    if not hasUpdate:
        print('小说没有更新。'.decode('utf-8'))
    else:
        novelManage(novels)
# Sample novel.json (the bookmark table read and rewritten by the script above):
{
"不败战神": {
"url": "http://www.23us.com/html/27/27736/",
"bookmark": "第两百八十八节 唐天的判断"
},
"大主宰": {
"url": "http://www.23us.com/html/28/28373/",
"bookmark": "第一百九十五章 取巧"
},
"神级英雄": {
"url": "http://www.23us.com/html/42/42368/",
"bookmark": "第101章 牵动公会巨头的卷轴"
}
}
转自:http://www.oschina.net/code/snippet_254703_25144
运行结果截图: