1. Proxy file format (proxies scraped from http://www.xicidaili.com); a sample is reconstructed below.
2. Free proxies are unreliable, so a retry decorator reconnects on failure, switching to a different proxy on each attempt.
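The original layout of ip.txt isn't shown here; judging from line.split() and the indices the script reads (column 0 = address, 1 = port, 2 = scheme, lowercase so it can double as a ProxyHandler key), each line is presumably whitespace-separated, e.g. (addresses are placeholders):

110.73.0.75 8123 http
221.10.102.199 80 http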
# coding: utf-8
# python 2.7
# Scrapes a single novel from 小说棋 (http://www.xs7.la/).
# Set the first-chapter URL and the total chapter count below.
# ip.txt holds the proxy pool.
import urllib2
from bs4 import BeautifulSoup
import sys
import traceback
import random
import gzip
import StringIO
reload(sys)
sys.setdefaultencoding('utf-8')
f = open("out.txt", "a+")
headers = {
"Host": "www.xs7.la",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
"Content-Type": "text/html",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "http://www.xs7.la/book/18_18966/",
"Accept-Encoding": 'deflat'
}
url = "http://www.xs7.la/book/18_18966/7828246.html" # 第一章网址
page = 184 # 章节数
nextHref = url
ipPool = []
def IPpool():
    # Load the proxy pool: one "address port scheme" entry per line of ip.txt.
    with open('ip.txt') as reader:
        for line in reader:
            if line.strip() != '':
                ipPool.append(line.split())
RETRIES = 0
# Retry counter; reset to 0 after each successfully scraped chapter.
count = {"num": RETRIES}
def conn_try_again(function):
    # Retry decorator: on any exception, call the wrapped function again
    # (each new getContent call picks a fresh random proxy), up to 10 retries.
    def wrapped(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception:
            print("--retrying, attempt %s (11 attempts max)--" % (count['num'] + 1))
            if count['num'] < 10:
                count['num'] += 1
                return wrapped(*args, **kwargs)
            else:
                raise
    return wrapped
bsObj = None
# Detect the encoding of a response body.
def getCoding(strInput):
    '''
    Return 'unicode', 'utf8' or 'gbk'; returns None when none of them fit
    (e.g. the body is still gzip-compressed).
    '''
    if isinstance(strInput, unicode):
        return "unicode"
    try:
        strInput.decode("utf8")
        return 'utf8'
    except Exception:
        pass
    try:
        strInput.decode("gbk")
        return 'gbk'
    except Exception:
        pass
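# Illustrative calls (examples added for this writeup, not in the original script):
#   getCoding(u'文')               -> 'unicode'
#   getCoding('文')                -> 'utf8'  (this source file is utf-8 encoded)
#   getCoding(u'文'.encode('gbk')) -> 'gbk'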
@conn_try_again
def getContent(url):
    global nextHref, page, bsObj
    # Proxy switch: True = use a random proxy from the pool,
    # False = use the fixed fallback proxy below.
    proxySwitch = True
    poolLen = len(ipPool)
    if poolLen > 0:
        # Pick a random proxy; together with the retry decorator this switches
        # to a different proxy on every failed attempt.
        i = random.randint(0, poolLen - 1)
        print(ipPool[i])
        proxy_host = ipPool[i][2] + "://" + ipPool[i][0] + ":" + ipPool[i][1]
        proxy_temp = {ipPool[i][2]: proxy_host}
        proxy_support = urllib2.ProxyHandler(proxy_temp)
    else:
        print('--proxy pool is empty, connecting directly--')
        proxy_support = urllib2.ProxyHandler({})
    # Fixed fallback proxy, used only when proxySwitch is False.
    nullproxy_handler = urllib2.ProxyHandler({"http": "124.172.232.49:8010"})
    if proxySwitch:
        opener = urllib2.build_opener(proxy_support)
    else:
        opener = urllib2.build_opener(nullproxy_handler)
    urllib2.install_opener(opener)
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req, timeout=3)
    r = response.read()
    encode = getCoding(r)
    if encode is None:
        # Detection failed: the server probably gzipped the body despite our
        # Accept-Encoding header, so decompress and detect again.
        if response.info().get('Content-Encoding') == 'gzip':
            r = gzip.GzipFile(fileobj=StringIO.StringIO(r)).read()
            encode = getCoding(r)
    if encode is not None:
        r = r.decode(encode)
    bsObj = BeautifulSoup(r, 'lxml')
    contentDiv = bsObj.find('div', id='content')
    content = contentDiv.get_text()
    preAndNextBar = bsObj.find('div', id='thumb')
    title = bsObj.find('div', id='bgdiv').h1.get_text()
if ("下一章" in preAndNextBar.get_text()):
next = None
aList = preAndNextBar.findAll('a')
for i in aList:
if ("下一章" in i.get_text()):
next = i
if (next == None):
print("下一章为空")
return True
nextHref = next.get('href')
print(title)
# print(content)
print(nextHref)
f.write("#####" + '\n')
f.write(title + '\n')
f.write(content + '\n')
count['num'] = 0
else:
return True
def main():
    IPpool()
    global page
    try:
        # One iteration per chapter; range(page) visits all `page` chapters.
        for num in range(page):
            if getContent(nextHref):
                break
        print("--- end ---")
    except Exception:
        traceback.print_exc()
    finally:
        f.close()
main()
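The retry-and-switch mechanism from note 2 is worth seeing in isolation. Below is a minimal sketch of the same decorator pattern (names such as try_again, flaky_fetch and MAX_RETRIES are mine, not from the script): each failure re-enters the wrapped function, which re-picks its proxy, until the retry budget is exhausted.

# coding: utf-8
# Minimal sketch of the retry-and-switch-proxy decorator pattern (Python 2).
import random

MAX_RETRIES = 3
state = {"num": 0}

def try_again(function):
    def wrapped(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception:
            if state['num'] < MAX_RETRIES:
                state['num'] += 1
                # Re-entering the function makes it pick a new proxy.
                return wrapped(*args, **kwargs)
            raise
    return wrapped

@try_again
def flaky_fetch():
    proxy = random.choice(['1.2.3.4:80', '5.6.7.8:8080'])  # stand-in for ipPool
    print('trying via %s' % proxy)
    if random.random() < 0.7:
        raise IOError('connection failed')
    return 'page html'

print(flaky_fetch())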
Appendix: proxy scraping: https://blog.youkuaiyun.com/u012795120/article/details/80857990
Download: https://download.youkuaiyun.com/download/u012795120/10534448