1、urllib2和urllib两个内置模块实现http请求
例子:
import urllib2
res = urllib2.urlopen('http://www.zhihu.com')
html = res.read()
print html
或者
import urllib2
request = urllib2.Request('http://www.zhihu.com')
res = urllib2.urlopen(request)
html = res.read()
print html
或者
import urllib2
import urllib
url = 'http://test.vfast.com/login'
data = {'username': 'vfast',
'password': 'vfast'
}
data = urllib.urlencode(data)
result = urllib2.Request(url, data)
response = urllib2.urlopen(result)
html = response.read()
print html
添加header头
import urllib2
import urllib
url = 'https://mail.qq.com/'
data = {'username': 'xxxx@qq.com',
'password': 'xxxx'
}
user_agent = 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
referer = 'https://mail.qq.com/'
headers = {'User-Agent': user_agent, 'Referer': referer}
data = urllib.urlencode(data)
result = urllib2.Request(url, data, headers)
response = urllib2.urlopen(result)
html = response.read()
print html
或者
import urllib2
import urllib
url = 'https://mail.qq.com/'
data = {'username': 'xxxx@qq.com',
'password': 'xxxx'
}
user_agent = 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
referer = 'https://mail.qq.com/'
headers = {'User-Agent': user_agent, 'Referer': referer}
data = urllib.urlencode(data)
result = urllib2.Request(url)
result.add_header('User-Agent', user_agent)
result.add_header('Referer', referer)
result.add_data(data)
response = urllib2.urlopen(result)
html = response.read()
print html
获取cookie的值
import urllib2
import cookielib
cookie = cookielib.CookieJar()
res = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
response = res.open('http://www.zhihu.com')
for item in cookie:
print item.name + ':' + item.value
自定义cookie内容
import urllib2
opener = urllib2.build_opener()
opener.addheaders.append(('Cookie', 'email=' + 'xxxxx'))
requests = urllib2.Request('http://www.zhihu.com/')
response = opener.open(requests)
print response.headers
data = response.read()
print data
timeout设置超时
python2.6版本之前,urllib2的api没有暴露timeout,可以通过更改Socket的全局timeout
# Bug fix: the four statements were fused onto a single line, which is not
# valid Python. Set a process-wide default socket timeout (seconds).
# urllib2.socket is the same socket module, so the second call repeats the
# first; it is kept from the original but is redundant.
import urllib2
import socket

socket.setdefaulttimeout(10)
urllib2.socket.setdefaulttimeout(10)
python2.6之后,urlopen函数提供了timeout设置
import urllib2
resuest = urllib2.Request('http://www.zhihu.com')
response = urllib2.urlopen(resuest, timeout=2)
html = response.read()
print html
获取http响应码
import urllib2
try:
response = urllib2.urlopen('http://www.zhihu.com')
print response.code
except urllib2.HTTPError as e:
if hasattr(e, 'code'):
print 'Error code:', e.code
重定向
import urllib2
response = urllib2.urlopen('http://www.zhihu.com')
# urlopen() follows redirects automatically; if the final URL differs from
# the requested one, at least one redirect occurred. Bug fix: the original
# used ==, which is True when the request was NOT redirected, inverting the
# meaning of isRedirected.
isRedirected = response.geturl() != 'http://www.zhihu.com'  # 检查重定向
自定义HTTPRedirectHandler类,不自动重定向
# Custom HTTPRedirectHandler that suppresses automatic redirect handling.
# (This snippet uses the Python 3 urllib.request API, unlike the urllib2
# examples above.)
# Fix: import the submodule explicitly -- a bare "import urllib" does not
# guarantee that urllib.request is loaded.
import urllib.request


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """Redirect handler that blocks 301s and annotates 302 responses."""

    def http_error_301(self, req, fp, code, msg, headers):
        # Returning None prevents the opener from following a 301 redirect.
        pass

    def http_error_302(self, req, fp, code, msg, headers):
        # Delegate to the base-class 301 handling, then record the status
        # code and target URL on the result for the caller to inspect.
        result = urllib.request.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        result.status = code
        result.newurl = result.geturl()
        return result
# Build an opener using the custom no-redirect handler and fetch the page
# through it (network I/O; 301 redirects will not be followed).
opener = urllib.request.build_opener(RedirectHandler)
opener.open('http://www.zhihu.com')
Proxy代理设置
import urllib2
proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy,)
response = urllib2.urlopen('http://www.zhihu.com/')
print response.read()
2、httplib结合urllib实现http请求
创建HTTPConnection对象:class httplib.HTTPConnection(host[,port[,strict[,timeout[,source_address]]]])
发送请求:HTTPConnection.request(method,url[,body[,headers]])
获取响应:HTTPConnection.getresponse()
读取响应信息:HTTPResponse.read([amt])
获得指定头信息:HTTPResponse.getheader(name[,default])
获取响应头(header,value)元组的列表:HTTPResponse.getheaders()
获得底层socket文件描述符:HTTPResponse.fileno()
获得头内容:HTTPResponse.msg
获得头http版本:HTTPResponse.version
获得返回状态码:HTTPResponse.status
获得返回说明:HTTPResponse.reason
例子:
get请求
import httplib
conn = None
try:
conn = httplib.HTTPConnection('www.zhihu.com')
conn.request('GET', '/')
response = conn.getresponse()
print response.status, response.reason
print '-' * 40
headers = response.getheaders()
for h in headers:
print h
print '-' * 40
print response.msg
except Exception, e:
print e
finally:
if conn:
conn.close()
post请求
import httplib,urllib
conn = None
try:
params = urllib.urlencode({'name':'vfast','age':22})
headers = {'Content-type':'application/x-www-form-urlencoded','Accept':'text/plain'}
conn = httplib.HTTPConnection('www.zhihu.com',80,timeout=3)
conn.request('POST','/login',params,headers)
response = conn.getresponse()
print response.getheaders()
print response.status
print response.read()
except Exception,e:
print e
finally:
if conn:
conn.close()
python爬虫与项目实战学习记录
本文介绍了使用Python的urllib2和urllib模块以及httplib模块实现HTTP请求的方法,包括GET、POST操作,设置Header、Cookie,处理超时、重定向和Proxy。还提供了具体的代码示例,适用于Python爬虫和项目实战。
5113

被折叠的 条评论
为什么被折叠?



