I'm using Python 3.3, and crawler material for this version is a bit scarce, so I'll build things up from the basics.
How do you check which functions are available in your version?
>>> import urllib
>>> help(urllib)
Help on package urllib:

NAME
    urllib

PACKAGE CONTENTS
    error
    parse
    request
    response
    robotparser

FILE
    c:\python33\lib\urllib\__init__.py
>>> import urllib.request
>>> dir(urllib.request)
['AbstractBasicAuthHandler', 'AbstractDigestAuthHandler', 'AbstractHTTPHandler', 'BaseHandler', 'CacheFTPHandler', 'ContentTooShortError', 'FTPHandler', 'FancyURLopener', 'FileHandler', 'HTTPBasicAuthHandler', 'HTTPCookieProcessor', 'HTTPDefaultErrorHandler', 'HTTPDigestAuthHandler', 'HTTPError', 'HTTPErrorProcessor', 'HTTPHandler', 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', 'HTTPRedirectHandler', 'HTTPSHandler', 'MAXFTPCACHE', 'OpenerDirector', 'ProxyBasicAuthHandler', 'ProxyDigestAuthHandler', 'ProxyHandler', 'Request', 'URLError', 'URLopener', 'UnknownHandler', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__initializing__', '__loader__', '__name__', '__package__', '__version__', '_cut_port_re', '_ftperrors', '_have_ssl', '_localhost', '_noheaders', '_opener', '_parse_proxy', '_proxy_bypass_macosx_sysconf', '_randombytes', '_safe_gethostbyname', '_thishost', '_url_tempfiles', 'addclosehook', 'addinfourl', 'base64', 'bisect', 'build_opener', 'collections', 'contextlib', 'email', 'ftpcache', 'ftperrors', 'ftpwrapper', 'getproxies', 'getproxies_environment', 'getproxies_registry', 'hashlib', 'http', 'install_opener', 'io', 'localhost', 'noheaders', 'os', 'parse_http_list', 'parse_keqv_list', 'pathname2url', 'posixpath', 'proxy_bypass', 'proxy_bypass_environment', 'proxy_bypass_registry', 'quote', 're', 'request_host', 'socket', 'splitattr', 'splithost', 'splitpasswd', 'splitport', 'splitquery', 'splittag', 'splittype', 'splituser', 'splitvalue', 'ssl', 'sys', 'tempfile', 'thishost', 'time', 'to_bytes', 'unquote', 'unwrap', 'url2pathname', 'urlcleanup', 'urljoin', 'urlopen', 'urlparse', 'urlretrieve', 'urlsplit', 'urlunparse', 'warnings']
>>>
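By the way, help() also works on a single name, so once dir() has shown you what exists you can read a specific function's documentation:
>>> import urllib.request
>>> help(urllib.request.urlopen)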
You can see plenty of usable functions in that list. The one we want is urlopen, so let's write my first crawler.
Fetching a page's HTML
import urllib.request  # importing bare urllib does NOT pull in the request submodule

response = urllib.request.urlopen('http://www.baidu.com')
html = response.read()
print(html)
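One thing to note: in Python 3, read() returns bytes, not str. A minimal sketch of decoding the page with the charset the server reports (falling back to utf-8 is my assumption for when none is given):

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
raw = response.read()                                        # bytes
charset = response.headers.get_content_charset() or 'utf-8'  # fallback is an assumption
print(raw.decode(charset))                                   # now a str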
This afternoon I spent a little time getting a form POST to Douban's login page working, and it tested OK.
import http.cookiejar
import re
import urllib.request as request
import urllib.parse as parse
from bs4 import BeautifulSoup

# Save cookies, so pages visited after the login share the session
cj = http.cookiejar.LWPCookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cj))
request.install_opener(opener)

# Fetch the login page and locate the captcha image
login_path = 'http://www.douban.com/accounts/login'
login_page = request.urlopen(login_path)
soup_login = BeautifulSoup(login_page)
image = soup_login.find(id='captcha_image')
image_url = image['src']
print(image_url)

# The captcha id is embedded in the image URL as '...?id=<id>:...'
captcha_id = ''.join(re.findall(r'=\w+:', image_url))
captcha_id = captcha_id[1:-1]
print(captcha_id)

# Open the printed URL in a browser and type in what the image shows
captcha_solution = input('captcha: ')

# Submit the form ('captcha-id' as the field name for the id is my assumption)
data = {
    "form_email": "525***@qq.com",
    "form_password": "******",
    "captcha-id": captcha_id,
    "captcha-solution": captcha_solution,
}
post_data = parse.urlencode(data).encode("utf-8")
res = request.urlopen(login_path, post_data)
print(res.status, res.reason)
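Because install_opener made this opener the global one, every later request.urlopen call sends the session cookies automatically, so a quick check of the login is to fetch a page that requires it. A sketch; the URL is my assumption, not a tested endpoint:

# The installed opener attaches the cookies held in cj automatically
page = request.urlopen('http://www.douban.com/mine/')  # hypothetical logged-in-only page
print(page.status, page.reason)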
Handling exceptions: URLError and HTTPError
import urllib.request as request
import urllib.error as error

url = 'http://www.baidu.com'
try:
    req = request.urlopen(url)
except error.URLError as e:
    # HTTPError is a subclass of URLError, so this single handler catches both
    print('Connection failed:', e.reason)
else:
    print('Successful connection', 'status: ' + str(req.status), req.reason)
geturl(): getting the real URL
import urllib.request as request

old_url = 'http://www.baidu.com'
req = request.Request(old_url)
response = request.urlopen(req)
print('old_url', old_url)
print('real_url', response.geturl())  # the URL that was actually fetched
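You only see a difference when the server redirects. A sketch, assuming http://baidu.com redirects to http://www.baidu.com/ (urlopen follows the redirect for you):

import urllib.request as request

response = request.urlopen('http://baidu.com')  # assumed to redirect
print('requested:', 'http://baidu.com')
print('landed on:', response.geturl())          # the post-redirect URL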
The info() method returns the response headers (an email.message.Message-like object, not a plain dict) describing the fetched page
import urllib.request as request
old_url = 'http://www.baidu.com'
req = request.Request(old_url)
response = request.urlopen(req)
print('info',response.info())
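The returned object behaves like a mapping from header names to values, so individual headers can be looked up or iterated:

import urllib.request as request

response = request.urlopen('http://www.baidu.com')
headers = response.info()
print(headers.get('Content-Type'))     # one header, None if absent
for name, value in headers.items():    # all headers
    print(name, ':', value)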
Printing Baidu's cookies
import http.cookiejar
import urllib.request as request

cookie = http.cookiejar.LWPCookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cookie))
url = 'http://www.baidu.com'
response = opener.open(url)
for item in cookie:
    print('name=', item.name)
    print('value=', item.value)
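Like the Douban example, the jar can be written to disk and restored in a later run; a sketch (the filename is my assumption, and ignore_discard=True is needed because these are session cookies that would otherwise be dropped):

# Save the jar in libwww-perl format
cookie.save('baidu_cookies.txt', ignore_discard=True)

# A later run can load it back into a fresh jar
restored = http.cookiejar.LWPCookieJar()
restored.load('baidu_cookies.txt', ignore_discard=True)
print(len(list(restored)), 'cookies restored')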