一 pycurl介绍
pycurl模块为libcurl库提供了一个python接口。libcurl是一个开源免费且方便快捷的基于客户端的url传输库,支持FTP,HTTP,HTTPS,IMAP,IMAPS,LDAP,LDAPS,POP3,POP3S,RTMP,RTSP,SCP等等。libcurl还支持SSL认证,HTTP POST,HTTP PUT,FTP UPLOADING等等。和urllib模块类似,pycurl模块也可以用来获取一个url的对象。pycurl使用了大部分libcurl提供的函数,使得pycurl具有以下特性:
快速 libcurl本身就很快速,pycurl就是对libcurl进行了一次封装,所以pycurl同样很快速。
支持多种协议,SSL,认证和代理选项。pycurl支持大部分libcurl的回调函数。
multi 和 share 接口支持
可以和应用的I/O整合
二 pycurl使用案例
1.安装pycurl
CentOS6 下使用pip install pycurl安装
可以使用ipython来调试
2.获取一个url响应结果
import pycurl
try:
    # Python 3: libcurl hands the body to the write callback as bytes.
    from io import BytesIO
except ImportError:
    # Python 2
    from StringIO import StringIO as BytesIO

# pycurl does not keep the response body itself, so collect it in a buffer.
buffer = BytesIO()
c = pycurl.Curl()
try:
    c.setopt(c.URL, 'http://pycurl.io/')
    c.setopt(c.WRITEFUNCTION, buffer.write)
    c.perform()
finally:
    # Release the handle even when the transfer raised pycurl.error.
    c.close()

# body is a byte string; decode it with the server's charset before
# treating it as text.
body = buffer.getvalue()
print(body)
pycurl本身不会存储url的响应结果,因此,需要设置一个buffer,让pycurl将结果写入到这个buffer中
想要获取调试信息,可以设置
c.setopt(c.VERBOSE, True)
等同于 curl -v
3.审查响应头
在实际案例中,我们想要根据服务端的编码格式来解码响应结果
import pycurl
import re
try:
    from io import BytesIO
except ImportError:
    from StringIO import StringIO as BytesIO

# Response headers recorded by header_function, keyed by lowercased name.
headers = {}


def header_function(header_line):
    """Record one raw response header line in the module-level ``headers``.

    Intended to be installed with ``c.setopt(c.HEADERFUNCTION, ...)``.

    Args:
        header_line: one header line as a byte string, including the
            trailing newline.

    Returns:
        None. Lines without a colon (such as the status line) are ignored.
        A header that appears more than once keeps only its last value.
    """
    # HTTP standard specifies that headers are encoded in iso-8859-1.
    # On Python 2, decoding step can be skipped.
    # On Python 3, decoding step is required.
    header_line = header_line.decode('iso-8859-1')

    # Header lines include the first status line (HTTP/1.x ...).
    # We are going to ignore all lines that don't have a colon in them.
    # This will botch headers that are split on multiple lines...
    name, colon, value = header_line.partition(':')
    if not colon:
        return

    # Strip the trailing newline and any whitespace around the colon.
    # Header names are case insensitive, so store them lowercased.
    headers[name.strip().lower()] = value.strip()
buffer = BytesIO()
c = pycurl.Curl()
try:
    c.setopt(c.URL, 'http://pycurl.io')
    c.setopt(c.WRITEFUNCTION, buffer.write)
    # Set our header function.
    c.setopt(c.HEADERFUNCTION, header_function)
    c.perform()
finally:
    # Release the handle even when the transfer raised pycurl.error.
    c.close()

# Figure out what encoding was sent with the response, if any.
# Check against lowercased header name.
encoding = None
if 'content-type' in headers:
    content_type = headers['content-type'].lower()
    # Raw string keeps the regex escape intact. Stopping at ';' or a quote
    # means 'charset="utf-8"; boundary=x' still yields plain 'utf-8'.
    match = re.search(r'charset="?([^\s;"]+)', content_type)
    if match:
        encoding = match.group(1)
        print('Decoding using %s' % encoding)
if encoding is None:
    # Default encoding for HTML is iso-8859-1.
    # Other content types may have different default encoding,
    # or in case of binary data, may have no encoding at all.
    encoding = 'iso-8859-1'
    print('Assuming encoding is %s' % encoding)

body = buffer.getvalue()
# Decode using the encoding we figured out.
print(body.decode(encoding))
|
4.将响应结果写入到文件
import pycurl
# Open the file in binary mode so the response bytes are written unmodified.
with open('out.html', 'wb') as f:
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, 'http://pycurl.io/')
        # Let libcurl write the body straight into the open file object.
        c.setopt(c.WRITEDATA, f)
        c.perform()
    finally:
        # Release the handle even when the transfer raised pycurl.error.
        c.close()
这里最重要的部分就是以二进制模式打开文件,这样响应结果可以以字节码写入到文件中,不需要编码和解码。
5.跟踪url跳转
libcurl和pycurl默认不跟踪url跳转。
import pycurl
curl = pycurl.Curl()
# This URL redirects to https://www.python.org/.
curl.setopt(pycurl.URL, 'http://www.python.org/')
# Tell libcurl to follow the redirect instead of stopping at it.
curl.setopt(pycurl.FOLLOWLOCATION, True)
curl.perform()
curl.close()
6.审查响应
import pycurl
try:
    from io import BytesIO
except ImportError:
    from StringIO import StringIO as BytesIO

buffer = BytesIO()
c = pycurl.Curl()
try:
    c.setopt(c.URL, 'http://www.python.org/')
    c.setopt(c.WRITEFUNCTION, buffer.write)
    c.perform()

    # getinfo must be called before close, so all queries stay inside
    # this try block.

    # Last used URL
    print('Effective_url: %s' % c.getinfo(c.EFFECTIVE_URL))
    # HTTP response code
    print('Response_code: %d' % c.getinfo(c.RESPONSE_CODE))
    # Total time of previous transfer
    print('Total_time: %f' % c.getinfo(c.TOTAL_TIME))
    # Time from start until name resolving completed
    print('Namelookup_time: %f' % c.getinfo(c.NAMELOOKUP_TIME))
    # Time from start until the connect to the remote host or proxy completed
    print('Connect_time: %f' % c.getinfo(c.CONNECT_TIME))
    # Time from start until SSL/SSH handshake completed
    print('SSL/SSH_time: %f' % c.getinfo(c.APPCONNECT_TIME))
    # Time from start until just before the transfer begins
    print('Pretransfer_time: %f' % c.getinfo(c.PRETRANSFER_TIME))
    # Time from start until just when the first byte is received
    print('Starttransfer_time: %f' % c.getinfo(c.STARTTRANSFER_TIME))
    # Time taken for all redirect steps before the final transfer
    print('Redirect_time: %f' % c.getinfo(c.REDIRECT_TIME))
    # Total number of redirects that were followed
    print('Redirect_count: %d' % c.getinfo(c.REDIRECT_COUNT))
    # URL a redirect would take you to, had you enabled redirects
    print('Redirect_url: %s' % c.getinfo(c.REDIRECT_URL))
    # Number of bytes uploaded
    print('Size_upload: %d' % c.getinfo(c.SIZE_UPLOAD))
    # Average upload speed
    print('Speed_upload: %f' % c.getinfo(c.SPEED_UPLOAD))
    # Number of bytes downloaded
    print('Size_download: %d' % c.getinfo(c.SIZE_DOWNLOAD))
    # Average download speed
    print('Speed_download: %f' % c.getinfo(c.SPEED_DOWNLOAD))
finally:
    # Release the handle even when the transfer raised pycurl.error.
    c.close()
# python response_info.py
Effective_url: http://www.python.org/
Response_code: 301
Total_time: 0.105395
Namelookup_time: 0.051208
Connect_time: 0.078317
SSL/SSH_time: 0.000000
Pretransfer_time: 0.078322
Starttransfer_time: 0.105297
Redirect_time: 0.000000
Redirect_count: 0
Redirect_url: https://www.python.org/
Size_upload: 0
Speed_upload: 0.000000
Size_download: 0
Speed_download: 0.000000
|
|
|
|
7.发送表单数据
发送表单数据使用POSTFIELDS参数
import pycurl
try:
    # Python 3
    from urllib.parse import urlencode
except ImportError:
    # Python 2
    from urllib import urlencode

c = pycurl.Curl()
try:
    c.setopt(c.URL, 'http://pycurl.io/tests/testpostvars.php')

    post_data = {'field': 'value'}
    # Form data must be provided already urlencoded.
    postfields = urlencode(post_data)
    # Sets request method to POST,
    # Content-Type header to application/x-www-form-urlencoded
    # and data to send in request body.
    c.setopt(c.POSTFIELDS, postfields)

    c.perform()
finally:
    # Release the handle even when the transfer raised pycurl.error.
    c.close()
8.文件上传
上传文件使用HTTPPOST参数,上传一个物理文件,使用FORM_FILE
import pycurl
uploader = pycurl.Curl()
uploader.setopt(pycurl.URL, 'http://pycurl.io/tests/testfileupload.php')

# A single form field named 'fileupload' whose content is read from disk;
# here the script uploads its own source file.
file_part = (pycurl.FORM_FILE, __file__)
uploader.setopt(pycurl.HTTPPOST, [('fileupload', file_part)])

uploader.perform()
uploader.close()
为上传的文件设置不同的文件名和内容类型
import pycurl
uploader = pycurl.Curl()
uploader.setopt(pycurl.URL, 'http://pycurl.io/tests/testfileupload.php')

file_part = (
    # Upload the contents of this script file ...
    pycurl.FORM_FILE, __file__,
    # ... but announce a different file name for the upload ...
    pycurl.FORM_FILENAME, 'helloworld.py',
    # ... and a different content type.
    pycurl.FORM_CONTENTTYPE, 'application/x-python',
)
uploader.setopt(pycurl.HTTPPOST, [('fileupload', file_part)])

uploader.perform()
uploader.close()
|
|
如果文件数据在内存中,使用BUFFER/BUFFERPTR
import pycurl
uploader = pycurl.Curl()
uploader.setopt(pycurl.URL, 'http://pycurl.io/tests/testfileupload.php')

# The data is already in memory: FORM_BUFFER supplies the file name to
# report and FORM_BUFFERPTR supplies the content itself.
memory_part = (
    pycurl.FORM_BUFFER, 'readme.txt',
    pycurl.FORM_BUFFERPTR, 'This is a fancy readme file',
)
uploader.setopt(pycurl.HTTPPOST, [('fileupload', memory_part)])

uploader.perform()
uploader.close()
9.处理FTP协议
import pycurl
ftp = pycurl.Curl()
# Apply the options in the same order as a sequence of setopt() calls.
for option, value in (
    (pycurl.URL, 'ftp://ftp.sunet.se/'),
    (pycurl.FTP_USE_EPSV, 1),
    (pycurl.QUOTE, ['cwd pub', 'type i']),
):
    ftp.setopt(option, value)
ftp.perform()
ftp.close()
10.Sharing Data
import sys
import threading

import pycurl

# Written via sys.stderr.write so the line works on Python 2 and 3 alike.
sys.stderr.write('Testing %s\n' % pycurl.version)


class Test(threading.Thread):
    """Thread that performs one transfer attached to a shared CurlShare."""

    def __init__(self, share):
        threading.Thread.__init__(self)
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.URL, 'http://curl.haxx.se')
        # Attach this easy handle to the share object passed in.
        self.curl.setopt(pycurl.SHARE, share)

    def run(self):
        try:
            self.curl.perform()
        finally:
            # Release the handle even when the transfer raised pycurl.error.
            self.curl.close()


s = pycurl.CurlShare()
# Share cookie data and DNS data between the handles.
s.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_COOKIE)
s.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_DNS)

t1 = Test(s)
t2 = Test(s)
t1.start()
t2.start()
# Wait for both transfers before dropping our reference to the share.
t1.join()
t2.join()
del s
11.使用multi接口
libcurl的easy接口是一个同步的,高效的,上手快的用于文件传输的接口。multi接口是一个异步的接口,它可以使用一个或者多个线程进行多路传输。
multi接口比easy接口多了以下几个功能:
提供一个pull接口:由使用libcurl的应用程序决定何时、何地让libcurl去接收或者发送数据
在同一个线程中启动多路同步传输而不必使应用程序变得更复杂
使得应用程序同时等待在应用程序本身的文件描述符和libcurl文件描述符上的动作变得简单许多
使得基于事件处理和扩展的传输可以达到上千个并行连接
例1
import pycurl
m = pycurl.CurlMulti()
# Track the easy handles in a plain Python attribute on the multi object.
m.handles = []

c1 = pycurl.Curl()
c2 = pycurl.Curl()
c1.setopt(c1.URL, 'http://curl.haxx.se')
c2.setopt(c2.URL, 'http://cnn.com')
c2.setopt(c2.FOLLOWLOCATION, 1)

m.add_handle(c1)
m.add_handle(c2)
m.handles.append(c1)
m.handles.append(c2)

num_handles = len(m.handles)
while num_handles:
    # Keep calling perform() for as long as libcurl asks to be called again.
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # Nothing more to do right now: wait (at most 1 second) for activity.
    m.select(1.0)

m.remove_handle(c2)
m.remove_handle(c1)
del m.handles
m.close()
c1.close()
c2.close()
|
|
例2
import os, sys
try:
    # libcurl delivers the body as bytes, so buffer it in a BytesIO.
    from io import BytesIO
except ImportError:
    from StringIO import StringIO as BytesIO
import pycurl

urls = (
    "http://curl.haxx.se",
    "http://www.python.org",
    "http://pycurl.sourceforge.net",
    "http://pycurl.sourceforge.net/tests/403_FORBIDDEN",  # that actually exists ;-)
    "http://pycurl.sourceforge.net/tests/404_NOT_FOUND",
)

# Read list of URIs from file specified on commandline, if any.
if len(sys.argv) > 1:
    with open(sys.argv[1]) as url_file:
        # Skip blank lines so they are not handed to libcurl as empty URLs.
        urls = [line.strip() for line in url_file if line.strip()]

# Set to True to dump every response body instead of the summary line.
show_body = False

# init
m = pycurl.CurlMulti()
m.handles = []
for url in urls:
    c = pycurl.Curl()
    # save info in standard Python attributes
    c.url = url.strip()
    c.body = BytesIO()
    c.http_code = -1
    m.handles.append(c)
    # pycurl API calls
    c.setopt(c.URL, c.url)
    c.setopt(c.WRITEFUNCTION, c.body.write)
    c.setopt(c.FOLLOWLOCATION, True)
    m.add_handle(c)

# get data
num_handles = len(m.handles)
while num_handles:
    while 1:
        ret, num_handles = m.perform()
        print('%s %s' % (ret, num_handles))
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # currently no more I/O is pending, could do something in the meantime
    # (display a progress bar, etc.)
    m.select(1.0)

# close handles
for c in m.handles:
    # save info in standard Python attributes
    c.http_code = c.getinfo(c.HTTP_CODE)
    # pycurl API calls
    m.remove_handle(c)
    c.close()
m.close()

# print result
for c in m.handles:
    data = c.body.getvalue()
    if show_body:
        print("********** %s **********" % c.url)
        print(data)
    else:
        print("%-53s http_code %3d, %6d bytes" % (c.url, c.http_code, len(data)))
|
|
例3
import os, sys
try:
    # libcurl delivers the body as bytes, so buffer it in a BytesIO.
    from io import BytesIO
except ImportError:
    from StringIO import StringIO as BytesIO
import pycurl

urls = (
    "http://curl.haxx.se",
    "http://www.python.org",
    "http://pycurl.sourceforge.net",
    "http://pycurl.sourceforge.net/THIS_HANDLE_IS_CLOSED",
)

# init
m = pycurl.CurlMulti()
m.handles = []
for url in urls:
    c = pycurl.Curl()
    # save info in standard Python attributes
    c.url = url
    c.body = BytesIO()
    c.http_code = -1
    c.debug = 0
    m.handles.append(c)
    # pycurl API calls
    c.setopt(c.URL, c.url)
    c.setopt(c.WRITEFUNCTION, c.body.write)
    c.setopt(c.FOLLOWLOCATION, True)
    m.add_handle(c)

# debug - close a handle while it is still attached to the multi object
if 1:
    c = m.handles[3]
    c.debug = 1
    c.close()

# get data
num_handles = len(m.handles)
while num_handles:
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # currently no more I/O is pending, could do something in the meantime
    # (display a progress bar, etc.)
    m.select(1.0)

# close handles
for c in m.handles:
    # save info in standard Python attributes
    try:
        c.http_code = c.getinfo(c.HTTP_CODE)
    except pycurl.error:
        # handle already closed - see debug above
        assert c.debug
        c.http_code = -1
    # pycurl API calls: three orderings are shown, only the last one runs.
    if 0:
        m.remove_handle(c)
        c.close()
    elif 0:
        # in the C API this is the wrong calling order, but pycurl
        # handles this automatically
        c.close()
        m.remove_handle(c)
    else:
        # actually, remove_handle is called automatically on close
        c.close()
m.close()

# print result
for c in m.handles:
    data = c.body.getvalue()
    if 0:
        print("********** %s **********" % c.url)
    else:
        print("%-53s http_code %3d, %6d bytes" % (c.url, c.http_code, len(data)))
|
可以使用multi接口来缩短访问很多url的时间
假设一个文件中包含了很多个url,现在要通过脚本去访问每个url判断返回码是不是200
文件中共有87个url
方法一 使用python的for语句顺序访问每个url
import os,sys
import pycurl
from StringIO import StringIO
# Load the URL list from the file named on the command line ("-" = stdin).
try:
    if sys.argv[1] == "-":
        urls = sys.stdin.readlines()
    else:
        # Text mode so the lines are str on Python 3 as well; the with
        # block closes the file once it has been read.
        with open(sys.argv[1], 'r') as url_file:
            urls = url_file.readlines()
except (IndexError, IOError):
    # IndexError: no argument given; IOError: the file could not be read.
    print("Usage: %s check_urls.txt <file with urls to check>" % sys.argv[0])
    raise SystemExit
class Curl:
    """Wrapper around one pycurl.Curl handle that keeps the URL, the
    response body buffer and the final HTTP status code together."""

    def __init__(self, url):
        self.url = url
        self.body = StringIO()
        self.http_code = 0
        self._curl = pycurl.Curl()
        self._curl.setopt(pycurl.URL, self.url)
        self._curl.setopt(pycurl.WRITEFUNCTION, self.body.write)
        self._curl.setopt(pycurl.FOLLOWLOCATION, True)
        self._curl.setopt(pycurl.NOSIGNAL, 1)

    def perform(self):
        """Run the transfer synchronously."""
        self._curl.perform()

    def close(self):
        """Store the HTTP status code, then release the handle."""
        # getinfo must be called before the handle is closed.
        self.http_code = self._curl.getinfo(pycurl.HTTP_CODE)
        self._curl.close()
# Visit the URLs one after another and report each status code.
for url in urls:
    url = url.strip()
    # Skip blank lines and '#' comment lines in the URL file.
    if not url or url[0] == '#':
        continue
    c = Curl(url)
    c.perform()
    c.close()
    print('%s %s' % (url, c.http_code))
real 2m46.134s
user 0m0.134s
sys 0m0.185s
|
|
|
|
方法二 使用pycurl的CurlMulti()函数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
import sys  # needed below for sys.argv / sys.stdin

from StringIO import StringIO
import pycurl

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    # The constant is SIG_IGN. The misspelt name SIG_ING raised ImportError,
    # which the handler below swallowed, so SIGPIPE was never ignored.
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    # The signal names are not importable here; continue without ignoring.
    pass
# A text file containing the URLs must be given on the command line
# ("-" reads the list from stdin instead).
try:
    if sys.argv[1] == "-":
        urls = sys.stdin.readlines()
    else:
        # Text mode so the lines are str on Python 3 as well; the with
        # block closes the file once it has been read.
        with open(sys.argv[1], 'r') as url_file:
            urls = url_file.readlines()
except (IndexError, IOError):
    # IndexError: no argument given; IOError: the file could not be read.
    print("Usage: %s check_urls.txt <file with urls to check>" % sys.argv[0])
    raise SystemExit
class Curl:
    """Wrapper around one pycurl.Curl handle that keeps the URL, the
    response body buffer and the final HTTP status code together."""

    def __init__(self, url):
        self.url = url
        self.body = StringIO()
        self.http_code = 0
        self._curl = pycurl.Curl()
        self._curl.setopt(pycurl.URL, self.url)
        self._curl.setopt(pycurl.FOLLOWLOCATION, True)
        self._curl.setopt(pycurl.WRITEFUNCTION, self.body.write)
        self._curl.setopt(pycurl.NOSIGNAL, 1)
        self._curl.debug = 0

    def perform(self):
        """Run the transfer synchronously."""
        self._curl.perform()

    def close(self):
        """Store the HTTP status code (0 if unavailable), then release the handle."""
        try:
            # getinfo must be called before the handle is closed.
            self.http_code = self._curl.getinfo(pycurl.HTTP_CODE)
        except pycurl.error:
            # The original asserted on an undefined name `c` here, which
            # raised NameError; record "no status" instead.
            self.http_code = 0
        self._curl.close()
def print_result(items, verbose=False):
    """Print one summary line per finished transfer.

    Args:
        items: iterable of objects exposing ``url``, ``http_code`` and a
            ``body`` buffer with ``getvalue()``.
        verbose: when true, print a banner and the full body of each
            response instead of the summary line. Defaults to the summary
            output, which is what the original hard-coded branch selected.
    """
    for c in items:
        data = c.body.getvalue()
        if verbose:
            print("*************** %s ******************" % c.url)
            print(data)
        else:
            # URL, status code and body length in fixed-width columns.
            print("%-60s %3d %6d" % (c.url, c.http_code, len(data)))
def test_multi():
    """Fetch every URL in the module-level ``urls`` through one CurlMulti
    object, then print one result line per URL."""
    handles = []
    m = pycurl.CurlMulti()
    for url in urls:
        url = url.strip()
        # Skip blank lines and '#' comment lines in the URL file.
        if not url or url[0] == '#':
            continue
        c = Curl(url)
        m.add_handle(c._curl)
        handles.append(c)

    # Kick off the transfers.
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break

    # Keep going while any handle is still active.
    while num_handles:
        # Wait (at most 5 seconds) for activity before calling perform again.
        m.select(5.0)
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

    for c in handles:
        c.close()
    m.close()

    print_result(handles)


if __name__ == '__main__':
    test_multi()
|
1
2
3
|
real 2m46.049s
user 0m0.082s
sys 0m0.132s
|
在pycurl作者给的案例中,使用CurlMulti()接口处理多个url速度是最快的,但是当url数量多时速度并不快,而且有部分url还不能获取正确的返回值
方法三 使用python的多线程模块
python由于有GIL全局解释器锁的存在,python提供的threading模块不能充分利用多线程的优势,在多核CPU服务器上,同一时刻实际上只有一个线程在运行,其他线程都处于锁定状态。所以python的threading模块不适合用于处理CPU密集型任务,相反,threading线程数量越多,速度越慢。但是对于I/O密集型或者网络密集型任务,还是可以使用threading模块
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
import os
import sys
import time
import threading

try:
    import Queue  # Python 2
except ImportError:
    import queue as Queue  # Python 3: same module under its new name

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        # Python 3: libcurl delivers bytes, so use a bytes buffer.
        from io import BytesIO as StringIO

import pycurl

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    # The constant is SIG_IGN. The misspelt name SIG_ING raised ImportError,
    # which the handler below swallowed, so SIGPIPE was never ignored.
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    # The signal names are not importable here; continue without ignoring.
    pass
# A text file containing the URLs must be given on the command line
# ("-" reads the list from stdin instead).
try:
    if sys.argv[1] == "-":
        urls = sys.stdin.readlines()
    else:
        # Text mode so the lines are str on Python 3 as well; the with
        # block closes the file once it has been read.
        with open(sys.argv[1], 'r') as url_file:
            urls = url_file.readlines()
except (IndexError, IOError):
    # IndexError: no argument given; IOError: the file could not be read.
    print("Usage: %s check_urls.txt <file with urls to check>" % sys.argv[0])
    raise SystemExit
class Curl:
    """Wrapper around one pycurl.Curl handle that keeps the URL, the
    response body buffer and the final HTTP status code together."""

    def __init__(self, url):
        self.url = url
        self.body = StringIO()
        self.http_code = 0
        self._curl = pycurl.Curl()
        self._curl.setopt(pycurl.URL, self.url)
        self._curl.setopt(pycurl.FOLLOWLOCATION, True)
        # Bound both the connect phase and the whole transfer to 15 seconds.
        self._curl.setopt(pycurl.CONNECTTIMEOUT, 15)
        self._curl.setopt(pycurl.TIMEOUT, 15)
        self._curl.setopt(pycurl.WRITEFUNCTION, self.body.write)
        self._curl.setopt(pycurl.NOSIGNAL, 1)
        self._curl.debug = 0

    def perform(self):
        """Run the transfer synchronously."""
        self._curl.perform()

    def close(self):
        """Store the HTTP status code (0 if unavailable), then release the handle."""
        try:
            # getinfo must be called before the handle is closed.
            self.http_code = self._curl.getinfo(pycurl.HTTP_CODE)
        except pycurl.error:
            # The original asserted on a name `c` that is not defined in
            # this method; record "no status" instead.
            self.http_code = 0
        self._curl.close()
# Fill the work queue with every usable URL.
queue = Queue.Queue()
for url in urls:
    url = url.strip()
    # Skip blank lines and '#' comment lines in the URL file.
    if not url or url[0] == "#":
        continue
    queue.put(url)

# No other thread touches the queue yet, so qsize() is exact here.
num_urls = queue.qsize()
if not num_urls:
    # Explicit exit rather than assert, which is stripped under `python -O`.
    raise SystemExit("no urls are given")

# One worker thread per URL.
# NOTE(review): consider capping this (e.g. min(num_urls, 1000)) for very
# large URL lists.
num_conn = num_urls
class WorkerThread(threading.Thread):
    """Worker that fetches URLs from a shared queue until it is empty."""

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while 1:
            try:
                url = self.queue.get_nowait()
            except Queue.Empty:
                # Nothing left to do: end this thread by leaving run().
                break
            c = Curl(url)
            try:
                c.perform()
            except pycurl.error as exc:
                # A failed transfer (for example a timeout) must not kill
                # the worker; report it and move on to the next URL.
                sys.stderr.write("http_url:%s\terror:%s\n" % (url, exc))
            finally:
                # Always record the status code and release the handle.
                c.close()
            print("http_url:%s\thttp_code:%s" % (url, c.http_code))
# Start a bunch of threads, all consuming from the same queue.
threads = []
for dummy in range(num_conn):
    t = WorkerThread(queue)
    t.start()
    threads.append(t)

# Wait for all threads to finish.
for thread in threads:
    thread.join()
|
1
2
3
|
real 0m10.500s
user 0m0.149s
sys 0m0.196s
|
可以看到时间明显比以上两种方法缩短了很多
所以,对于有大量url需要用pycurl来处理时,应该结合threading模块
参考资料:
http://pycurl.sourceforge.net/