# Beginner Python project: a small multi-process crawler. The multi-process run
# time turned out to be roughly the same as single-process — most likely limited
# by network bandwidth. Only the rank is printed; the real links and titles are
# not extracted yet. The code is rough.
# -*- coding:utf-8 -*-
import math
import multiprocessing
import os
import random
import re
import socket
import urllib
import urllib2

import pyquery
# 获取url html数据
# Fetch the raw HTML for the given url.
#
# Returns the response body as a string, or False when the url is empty or
# the request fails/times out.  Callers check the result with `if not html`,
# so a network error must NOT propagate as an exception and kill the worker.
def getUrlHtml(url):
    if not url:
        return False
    # Spoof a browser User-Agent and a Baidu referer so the request is
    # less likely to be rejected.
    headers = {
        'User-Agent': "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
        'Referer': "http://www.baidu.com",
    }
    request = urllib2.Request(url, None, headers)
    try:
        # 5s timeout so one stalled request does not block the worker.
        response = urllib2.urlopen(request, timeout=5)
        return response.read()
    except (urllib2.URLError, socket.error):
        # Bug fix: the original let URLError / socket.timeout crash the
        # process; signal failure with False as the caller expects.
        return False
#解析html
def htmlParse(html):
doc=pyquery.PyQuery(html)
div_f13_obj=doc('div.f13')
if len(div_f13_obj)==False:
return False
# print '未匹配到class=f13 的div'
# exit()
snapshot_data = []
for i in div_f13_obj:
a_obj=pyquery.PyQuery(i).find('a.m')
if len(a_obj)!=2:
continue
#获取href
snapshot_url = urllib.unquote(a_obj.attr('href'))
#字符串分割
snapshot_url_list= snapshot_url.split('&')
t_list = []
for i in range(len(snapshot_url_list)):
if i==0:
t_list.append(['cache_url',snapshot_url+'&fast=y'])
else:
t_list.append(snapshot_url_list[i].split('='))
snapshot_data.append(dict(t_list))
del div_f13_obj
del doc
return snapshot_data
# 实现步骤
def main(start_num=0,end_num=75):
ip_data=('220.181.112.244',
'180.149.132.47',
'220.181.57.217',
'220.181.111.188',
'61.135.169.121')
if not isinstance(start_num,int):
print '开始位置参数必须为数字!'
exit();
elif not isinstance(end_num,int):
print '结束位置参数必须位数字'
exit()
elif start_num>end_num:
print '起始位置不能大于结束位置'
exit()
url_list = []
for i in range(start_num,end_num):
url = ("http://%s/s?ie=utf-8&f=8&rsv_idx=1&tn=baidu&wd=php函数&pn=%s&rsv_p"
"q=9b313f3b00017589&rsv_t=53adpjNTUHrouLAVtoaoEgUJJhHTP2xU3B2JZUz7t7PJU1tg8ZY10"
"W1Wd4g&rsv_enter=1&rsv_sug3=4&rsv_sug1=3&rsv_sug2=0&inputT=946&rsv_sug4=2421") % (ip_data[random.randint(0,len(ip_data)-1)],(i*10))
url_list.append(url)
#获取排名数据
info_data = []
for i in range(len(url_list)):
html=getUrlHtml(url_list[i])
if not html:
print 'html 为空'
continue
# 解析排名
page_info_data = htmlParse(html)
if page_info_data:
if info_data=='':
info_data=page_info_data
else:
info_data =info_data+page_info_data
else:
print '未解析到html排名信息'
continue
for i in info_data:
print i['p1']
if __name__ == '__main__':
print '主线程ID %s.' % os.getpid()
#设置最大页码
baidu_page_count = 75.0
# 一个进程最大页码
pool_page_count = 20.0
# 计算进程即将开启的数量
pool_num=int(math.ceil(baidu_page_count/pool_page_count))
# print pool_num
# exit()
#开启进程
p=multiprocessing.Pool(pool_num)
#每个进程执行数量
run_num=int(math.ceil(baidu_page_count/pool_num))
#临时标记
last_run_num = 0
for x in xrange(pool_num):
s_num = 0
e_num = 0
if x==0:
e_num = run_num
else:
end_page_num = run_num*x+run_num
if end_page_num>baidu_page_count:
s_num = last_run_num
e_num = int(baidu_page_count)
else:
s_num = last_run_num
e_num = end_page_num
last_run_num = last_run_num+run_num
# main(s_num,e_num)
#开启进程
p.apply_async(main(s_num,e_num),args=(x,))
print '等待进程完成....'
p.close()
p.join()
print '进程结束'