根据 urllib2 的文档封装一个类，可处理 POST 提交数据、gzip 解压缩、Cookie 自动处理、通过代理下载等常用功能（使用前需导入 urllib、urllib2、cookielib、gzip 等标准库模块）：
class HttpUrllib(object):
    """Small urllib2 wrapper with cookie handling, optional proxies and gzip.

    Headers and form fields are accumulated on the instance through
    ``appendHeader`` / ``addData`` and are sent with every subsequent
    ``fetchHtmlOnGet`` call.  The file-level imports ``urllib``, ``urllib2``,
    ``cookielib`` and ``gzip`` are expected to be in scope.
    """

    def __init__(self, proxies=None):
        """Build the opener used for all requests.

        proxies -- optional mapping handed to ``urllib2.ProxyHandler``
                   (e.g. ``{'http': 'http://host:port'}``).  When falsy, no
                   explicit proxy handler is installed.
        """
        self.headerItem = {}
        self.dataItem = {}
        # A fresh CookieJar per instance keeps cookies between requests made
        # through this object.
        handlers = [urllib2.HTTPCookieProcessor(cookielib.CookieJar())]
        if proxies:
            # Keep the original handler order: proxy handler first.
            handlers.insert(0, urllib2.ProxyHandler(proxies))
        self.opener = urllib2.build_opener(*handlers)

    def appendHeader(self, key, value):
        """Set request header ``key`` to ``value`` for later requests."""
        self.headerItem[key] = value
        # Echo of the current headers kept from the original implementation;
        # note that it writes every header value (including any credentials)
        # to stdout.  The parenthesised form prints the same on Python 2.
        print(self.headerItem)

    def addData(self, key, values):
        """Set form field ``key`` to ``values`` and return the field dict."""
        self.dataItem[key] = values
        return self.dataItem

    def fetchHtmlOnGet(self, url):
        """Request ``url`` and return the response body.

        If any form data was added with ``addData`` it is url-encoded and
        passed as the request body (urllib2 sends a request that carries data
        as a POST); otherwise no body is sent.  A body whose
        ``Content-Encoding`` header mentions gzip is decompressed before it
        is returned.

        Exceptions raised by the opener (e.g. ``urllib2.URLError``) propagate
        to the caller.
        """
        data = urllib.urlencode(self.dataItem) if self.dataItem else None
        req = urllib2.Request(url, data, self.headerItem)
        urlnet = self.opener.open(req)
        try:
            htmlCode = urlnet.read()
            encoding = urlnet.info().getheader('Content-Encoding')
        finally:
            # Always release the connection, even if reading fails.
            urlnet.close()
        # Content-coding names are case-insensitive, so normalise first.
        if encoding and 'gzip' in encoding.lower():
            htmlCode = self.__ungzip(htmlCode)
        return htmlCode

    def __ungzip(self, content):
        """Return the gzip-compressed byte string ``content`` decompressed."""
        # Local import: BytesIO is the binary in-memory buffer and is
        # available on Python 2.6+ as well as Python 3.
        import io
        compressedstream = io.BytesIO(content)
        gz = gzip.GzipFile(fileobj=compressedstream)
        try:
            return gz.read()
        finally:
            gz.close()