1. setting.py 模板替换文件代码:
代码:
import random
# Download delay for this run: an integer part of 0 or 1 plus a random
# fraction in [0, 1). It is computed once, when the module is imported.
Interval = random.randint(0, 1) + random.random()

# Settings dictionary for the WeChat spider.
wechat_settings = {
    'LOG_FILE': 'wechat_spider.log',  # log file
    # NOTE(review): Scrapy's built-in setting is spelled LOG_ENABLED; confirm
    # which code actually reads this key before renaming it.
    'LOG_ENABLE': True,
    'LOG_ENCODING': 'utf-8',
    'LOG_LEVEL': 'DEBUG',  # DEBUG, WARNING
    # Time (in seconds) the downloader should wait before fetching
    # consecutive pages from the same website. This throttles the crawl so
    # the server is not hit too hard.
    'DOWNLOAD_DELAY': Interval,  # download delay
    'DOWNLOAD_TIMEOUT': 30,  # timeout limit
    'DEFAULT_REQUEST_HEADERS': {  # request headers
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        # 'user_agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1'
    },
}

# gsdata.cn "toparc" ranking pages, one per WeChat account.
# Guangdian Shiping (wx=GDSP360)
gdsp_url = 'http://www.gsdata.cn/rank/toparc?wxname=RQ0BRDTSUJDqMi2nMgA5O0O0O2O0O0O1&wx=GDSP360&sort=-1'
# Yingju Touban (wx=yingjutouban)
yjtb_url = 'http://www.gsdata.cn/rank/toparc?wxname=eQWBlDuSZJ2qpi1ndgG59211YmFu&wx=yingjutouban&sort=-1'
# 'http://www.gsdata.cn/rank/wxdetail?wxname=RQ0BRDTSUJDqMi2nMgA5O0O0O2O0O0O1'
# Wenyu Touban (wx=wenyutouban)
wytb_url = 'http://www.gsdata.cn/rank/toparc?wxname=dQ2BVDuSeJXqVi0nbg35V2i1YW4O0O0O&wx=wenyutouban&sort=-1'
# Account with wx=thepoemforyou
wndu_url = 'http://www.gsdata.cn/rank/toparc?wxname=dQGBhDlScJGq9ilnbgW5Z2v1cnlvdQO0O0OO0O0O&wx=thepoemforyou&sort=-1'

# Presumably the Redis key used by the duplicate filter -- confirm against
# the code that consumes it.
DUPEFILTER_REDIS_KEY = 'wechat:dupefilter'
import hashlib
def get_md5(src):
    """Return the hexadecimal MD5 digest of *src*.

    Args:
        src: Text to hash; it is encoded as UTF-8 before hashing. ``bytes``
            or ``bytearray`` input is also accepted and hashed as-is.

    Returns:
        The digest as a string of 32 lowercase hexadecimal characters.
    """
    # Binary input has no ``encode``; only text needs the UTF-8 conversion.
    if isinstance(src, (bytes, bytearray)):
        data = src
    else:
        data = src.encode('utf-8')
    return hashlib.md5(data).hexdigest()
2. middlewares.py模板替换文件代码:
代码:
import random
class UserAgentDownloadMiddleware(object):
    """Downloader middleware that gives each request a random User-Agent.

    One entry of ``USER_AGENT`` is chosen at random for every request and
    written to the request's ``User-Agent`` header.
    """

    # Pool of User-Agent strings to pick from.
    USER_AGENT = [
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6.4; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; chromeframe; Avant Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; InfoPath.1; .NET CLR 3.0.4506.',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; XH; rv:8.578.498) fr, Gecko/20121021 Camino/8.723+ (Firefox compatible)',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en; rv:1.8.1.4pre) Gecko/20070511 Camino/1.6pre',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (iPhone; U; Linux i686; pt-br) AppleWebKit/532+ (KHTML, like Gecko) Version/3.0 Mobile/1A538b Safari/419.3 Midori/0.2.0',
    ]

    def process_request(self, request, spider):
        """Set a randomly chosen User-Agent header on *request*.

        Args:
            request: The outgoing request; its ``headers`` mapping is
                modified in place.
            spider: The spider that issued the request (unused).

        Returns:
            None.
        """
        # The HTTP header name uses a hyphen. The former 'User_Agent' key
        # created an unrelated header and left the real User-Agent unchanged.
        request.headers['User-Agent'] = random.choice(self.USER_AGENT)