使用 Python 抓取小说


# coding: utf-8
import bs4
from bs4 import BeautifulSoup
import urllib2
import codecs
import time
import json
import sys


def novelFilter(content):
  """Strip the chapter wrapper markup and spacing from an HTML fragment.

  Args:
    content: text containing the serialized ``<dd id="contents">`` element
      of a chapter page (optionally preceded by other text).

  Returns:
    ``content`` with the ``<dd id="contents">`` / ``</dd>`` wrapper, all
    ``<br />`` tags and all space characters removed.
  """
  # The wrapper tags must be removed BEFORE spaces are stripped: the opening
  # tag itself contains a space, so stripping spaces first turns it into
  # '<ddid="contents">' and it is then never matched.
  # The doubled-break pattern precedes the single tag so the newline between
  # two consecutive breaks is dropped as well.
  # NOTE(review): the ' ' pattern may originally have been a non-breaking
  # space that was lost in copy/paste -- confirm against the real pages.
  patterns = (
    '<dd id="contents">',
    '</dd>',
    '<br />\n<br />',
    '<br />',
    ' ',
  )
  for fragment in patterns:
    content = content.replace(fragment, '')
  return content

def novelFetch(url, title):
  """Download one chapter page and return it as filtered plain text.

  Args:
    url: address of the chapter page. The response body is decoded as GBK,
      ignoring bytes that cannot be decoded.
    title: chapter title; it is placed on its own line ahead of the body and
      echoed to stdout as a progress message.

  Returns:
    ``'\\n' + title + '\\n' + body`` passed through ``novelFilter``, where
    ``body`` is the serialized ``<dd id="contents">`` element of the page.
    When the page has no such element the body is empty.
  """
  novel = urllib2.urlopen(url)
  try:
    raw = novel.read()
  finally:
    # Release the connection even when reading the body fails.
    novel.close()

  soup = BeautifulSoup(raw.decode('gbk', 'ignore'))
  node = soup.find('dd', id = 'contents')
  # find() returns None when the element is missing; str(None) would
  # otherwise put the literal text "None" into the saved novel.
  body = '' if node is None else str(node)
  contents = novelFilter('\n' + title + '\n' + body)

  # Parenthesised so the statement parses on both Python 2 and Python 3.
  print(title.decode("utf-8"))
  return contents

def novelSelect(url, mark):
  """Collect the text of every chapter listed after a bookmarked chapter.

  The index page at ``url`` is decoded as GBK and every ``<td>`` cell is
  scanned in document order. Once a link whose text equals ``mark`` has been
  seen, each following link is downloaded with ``novelFetch``.

  Args:
    url: address of the novel's chapter index; each chapter ``href`` is
      appended directly to it.
    mark: title of the last chapter already downloaded.

  Returns:
    A dict with two keys:
      'contents': concatenated text of the chapters found after ``mark``
        (empty when ``mark`` is the last chapter or was not found).
      'bookmark': title of the last link whose title could be read, whether
        or not that chapter was fetched successfully.
  """
  link = urllib2.urlopen(url)
  try:
    raw = link.read()
  finally:
    # Release the connection even when reading the body fails.
    link.close()

  soup = BeautifulSoup(raw.decode('gbk', 'ignore'))
  body = soup.findAll('td')

  flag = False
  cont = ''
  href = ''
  title = ''

  for cell in body:
    anchor = cell.a
    if anchor is None:
      # Layout cell without a chapter link.
      continue

    try:
      href = url + anchor['href']
      title = str(anchor.string)

      if flag and href:
        cont += novelFetch(href, title)

      if title.decode("utf-8") == mark:
        flag = True

    except Exception as err:
      # Best effort: one bad chapter must not abort the whole run, but the
      # failure is reported instead of vanishing, and KeyboardInterrupt /
      # SystemExit are no longer swallowed.
      sys.stderr.write('skipped %r: %r\n' % (href, err))

  return {
    'contents': cont,
    'bookmark': title
  }

def novelManage(info=0):
  """Load or save the bookmark database kept in ``novel.json``.

  Args:
    info: when truthy, the mapping to serialize into ``novel.json``; when
      falsy (the default), the file is read instead. Note that an empty
      mapping is falsy and therefore triggers a read, not a write.

  Returns:
    The mapping that was written, or the mapping parsed from the file.
  """
  if info:
    text = json.dumps(info, indent=2, ensure_ascii=False)
    # With ensure_ascii=False Python 2 may return a byte string; the UTF-8
    # writer below expects text.
    if isinstance(text, bytes):
      text = text.decode('utf-8')
    # Explicit encoding: the file holds non-ASCII titles.
    with codecs.open('novel.json', 'w', 'utf-8') as f:
      f.write(text)
  else:
    # 'with' closes the handle, which the read path previously leaked.
    with codecs.open('novel.json', 'r', 'utf-8') as f:
      info = json.loads(f.read())
  return info

if __name__=='__main__':
  # Entry point: for every novel tracked in novel.json, download the chapters
  # published after its bookmark, save them to a timestamped text file and
  # move the bookmark forward.
  novels = novelManage()
  hasUpdate = False

  for title in novels:
    novel = novelSelect(novels[title]['url'], novels[title]['bookmark'])

    cont = novel['contents']
    bookmark = novel['bookmark']

    # Only touch the bookmark when new chapter text was actually retrieved.
    if cont and bookmark:
      novels[title]['bookmark'] = bookmark

      timestamp = time.strftime("%Y%m%d%H%M", time.localtime())
      # Output is encoded as UTF-8; 'with' closes the file even if the
      # write fails.
      with codecs.open(title + '_' + timestamp + '.txt', 'w', 'utf-8') as f:
        f.write(cont)

      hasUpdate = True

  if not hasUpdate:
    # Message text: "The novels have no updates."
    print('小说没有更新。'.decode('utf-8'))
  else:
    # Persist the advanced bookmarks for the next run.
    novelManage(novels)

配置文件 novel.json 示例(脚本从中读取每本小说的 url 和 bookmark):

{
  "不败战神": {
    "url": "http://www.23us.com/html/27/27736/", 
    "bookmark": "第两百八十八节 唐天的判断"
  }, 
  "大主宰": {
    "url": "http://www.23us.com/html/28/28373/", 
    "bookmark": "第一百九十五章 取巧"
  }, 
  "神级英雄": {
    "url": "http://www.23us.com/html/42/42368/", 
    "bookmark": "第101章 牵动公会巨头的卷轴"
  }
}


转自:http://www.oschina.net/code/snippet_254703_25144

运行结果截图:

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值