最近开始学习gevent这个python并发库。作为一个轻量级的库,它为各种网络和并发相关的操作提供了简洁的API。众所周知,线程在虚拟内存和内核开销上代价都很大,所以基于线程的高并发意义并不大;而且由于python GIL(Global Interpreter Lock)的存在,在python里使用线程做并发很受限制。这也是gevent这类协程并发库出现的原因。我最近尝试写了一个在短时间内发起多个url请求的demo,为了对比学习,分别实现了gevent版本和thread版本,结果却没有按剧本走。废话不多说,直接上代码:
thread版本
# _*_ coding: utf-8 _*_
import sys
reload(sys)                      # Python 2 only: re-expose sys.setdefaultencoding
sys.setdefaultencoding("utf8")   # force utf8 as the process-wide default codec
import requests
import threading
import time
import urllib2                   # NOTE(review): imported but never used below
finished = 0                     # number of worker threads that have been joined
def GetUrl(pagenum):
    """Build the Baidu job-search URL for result page *pagenum*.

    Each page holds 20 records, so the ``pn`` offset is ``pagenum * 20``.
    """
    return ('http://opendata.baidu.com/zhaopin/s?p=mini'
            '&wd=%B0%D9%B6%C8&pn={0}&rn=20'.format(pagenum * 20))
def setUrlSet():
    """Populate the global ``urlset`` with ``requestnum`` URLs.

    Page numbers cycle through 0..37, so the same 38 result pages are
    reused when requestnum exceeds 38.
    """
    urlset.extend(GetUrl(i % 38) for i in xrange(requestnum))
def GetResponse(pagenum):
    """Fetch the URL stored at index *pagenum* of the global ``urlset``.

    Best-effort: any failure is printed and swallowed so the calling
    worker thread keeps processing its remaining pages.
    """
    try:
        # the response object was never used, so don't bind it
        requests.get(urlset[pagenum])
    except Exception as e:  # was `except Exception, e` (Py2-only) + a dead `pass`
        print(e)
def DigJobByPagenum(pagenum, requestnum):
    """Worker body: fetch every urlset index congruent to *pagenum*.

    ``threadnum`` is read from the module globals; indices step by it so
    the threads partition [pagenum, requestnum) among themselves.
    """
    print('%d begin' % pagenum)
    for idx in xrange(pagenum, requestnum, threadnum):
        GetResponse(idx)
    print('%d over' % pagenum)
def NormalThread(threadnum):
    """Run the whole request batch on *threadnum* daemon threads and time it.

    Each thread handles the indices i, i+threadnum, i+2*threadnum, ...
    The global ``finished`` is bumped once per joined thread.
    """
    global finished
    begin = time.time()
    print("%s is running..." % threading.current_thread().name)
    workers = [threading.Thread(target=DigJobByPagenum, args=(i, requestnum))
               for i in xrange(threadnum)]
    for worker in workers:
        worker.daemon = True
        worker.start()
    for worker in workers:
        worker.join()
        finished += 1
    print("%s is stop.The total time is %0.2f" %
          (threading.current_thread().name, time.time() - begin))
def GetAvageTime(array):
    """Return the arithmetic mean of *array*.

    :param array: sequence of numbers (request durations in seconds)
    :returns: mean as a float; 0.0 for an empty sequence, where the
        original hand-rolled loop raised ZeroDivisionError
    """
    if not array:
        return 0.0
    # start value 0.0 forces float division even under Python 2
    return sum(array, 0.0) / len(array)
if __name__ == '__main__':
    # Usage: python FILE.py THREADNUM REQUESTNUM
    threadnum = int(sys.argv[1])    # worker-thread count (read globally by DigJobByPagenum)
    requestnum = int(sys.argv[2])   # total number of HTTP requests to issue
    print 'threadnum : %s,requestnum %s ' % (threadnum, requestnum)
    originStartTime = time.time()   # NOTE(review): never read afterwards — looks unused
    urlset = []                     # shared URL list, filled by setUrlSet()
    setUrlSet()
    NormalThread(threadnum)
gevent 版本
# _*_ coding: utf-8 _*_
import sys
reload(sys)                      # Python 2 only: re-expose sys.setdefaultencoding
sys.setdefaultencoding("utf8")   # force utf8 as the process-wide default codec
from gevent import monkey
monkey.patch_all()               # must run before socket-using modules (requests) are imported
import gevent                    # NOTE(review): imported but never referenced directly
from gevent import pool
import requests
import time
finished = 0                     # count of completed requests, bumped by GetResponse
def GetUrl(pagenum):
    """Return the Baidu job-search URL for page *pagenum* (20 hits per page)."""
    offset = pagenum * 20
    return ('http://opendata.baidu.com/zhaopin/s?p=mini&wd=%B0%D9%B6%C8&pn='
            + str(offset) + '&rn=20')
def setUrlSet():
    """Fill the global ``urlset`` with ``requestnum`` URLs, cycling over 38 pages."""
    for idx in xrange(requestnum):
        urlset.append(GetUrl(idx % 38))
def GetResponse(url):
    """Fetch *url*, record the elapsed time, and bump the completion counter.

    Original left ``requests.get`` unguarded: one failed request killed
    its greenlet and skipped all bookkeeping. Now failures are printed
    and the timing/counter updates still happen, matching the thread
    version's best-effort error handling.
    """
    global finished
    startime = time.time()
    try:
        # response object was never used, so don't bind it
        requests.get(url)
    except Exception as e:
        print(e)
    print(url)
    spendtime = time.time() - startime
    NormalSpendTime.append(spendtime)   # per-request duration, consumed by GetAvageTime
    finished += 1
    print(finished)
def GetAvageTime(array):
    """Return the arithmetic mean of *array*.

    :param array: sequence of numbers (request durations in seconds)
    :returns: mean as a float; 0.0 for an empty sequence, where the
        original hand-rolled loop raised ZeroDivisionError
    """
    if not array:
        return 0.0
    # start value 0.0 forces float division even under Python 2
    return sum(array, 0.0) / len(array)
def RunAsyncJob():
    """Run every URL in ``urlset`` through a gevent pool and print a summary.

    Pool size comes from the global ``concurrent``; GetResponse does the
    per-request timing bookkeeping.
    """
    workers = pool.Pool(concurrent)
    for url in urlset:
        workers.spawn(GetResponse, url)
    workers.join()
    allSpendTime = time.time() - originStartime
    print('Total spend time is %0.3f, total request num is %s within %s seconds'
          % (allSpendTime, finished, timeoutNum))
    print('Each request time is %0.3f' % GetAvageTime(NormalSpendTime))
if __name__ == '__main__':
    # Usage: python FILE.py CONCURRENCY REQUESTNUM
    concurrent = int(sys.argv[1])   # gevent pool size
    requestnum = int(sys.argv[2])   # total number of HTTP requests to issue
    timeoutNum = 100                # only echoed in the summary line; no real timeout is applied
    NormalSpendTime = []            # per-request durations, appended by GetResponse
    urlset = []                     # shared URL list, filled by setUrlSet()
    urlActionList = []              # NOTE(review): never used — looks like a leftover
    setUrlSet()
    originStartime = time.time()    # wall-clock start for the whole batch
    RunAsyncJob()
以上就是我的程序,接着我就用time工具简单测试了一下,数据如下:
命令格式:time python FILENAME.py 50 1000 (其中50为测试的线程数目或者gevent的pool数目,1000为请求次数)
(并发数, 请求数) | gevent | thread
--- | --- | ---
(50,1000) | real 0m19.636s user 0m5.224s sys 0m1.538s | real 0m16.869s user 0m4.339s sys 0m1.225s |
(60,1000) | real 0m19.596s user 0m5.160s sys 0m1.566s | real 0m17.516s user 0m4.206s sys 0m1.416s |
(40,1000) | real 0m18.221s user 0m5.275s sys 0m1.466s | real 0m18.231s user 0m4.200s sys 0m1.132s |
(50,2000) | real 0m33.694s user 0m10.281s sys 0m2.888s | real 0m44.301s user 0m8.320s sys 0m2.289s |
(50,3000) | real 0m50.416s user 0m15.298s sys 0m4.277s | real 0m47.471s user 0m12.232s sys 0m3.344s |
(50,500) | real 0m10.219s user 0m2.611s sys 0m0.765s | real 0m9.799s user 0m2.235s sys 0m0.652s |
(100,1000) | real 0m19.316s user 0m5.088s sys 0m1.735s | real 0m18.361s user 0m4.131s sys 0m1.225s |
注:表中数据的real、user、sys的意义可参看这里