Python web crawler

I'm using Python 3.3. Crawler material for this version is a bit scarce, so I'll summarize things starting from the basics.

How do you check which functions are available in the version you have installed?

>>> import urllib

>>> help(urllib)
Help on package urllib:

NAME
    urllib

PACKAGE CONTENTS
    error
    parse
    request
    response
    robotparser

FILE
    c:\python33\lib\urllib\__init__.py
>>> import urllib.request
>>> dir(urllib.request)
['AbstractBasicAuthHandler', 'AbstractDigestAuthHandler', 'AbstractHTTPHandler', 'BaseHandler', 'CacheFTPHandler', 'ContentTooShortError', 'FTPHandler', 'FancyURLopener', 'FileHandler', 'HTTPBasicAuthHandler', 'HTTPCookieProcessor', 'HTTPDefaultErrorHandler', 'HTTPDigestAuthHandler', 'HTTPError', 'HTTPErrorProcessor', 'HTTPHandler', 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', 'HTTPRedirectHandler', 'HTTPSHandler', 'MAXFTPCACHE', 'OpenerDirector', 'ProxyBasicAuthHandler', 'ProxyDigestAuthHandler', 'ProxyHandler', 'Request', 'URLError', 'URLopener', 'UnknownHandler', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__initializing__', '__loader__', '__name__', '__package__', '__version__', '_cut_port_re', '_ftperrors', '_have_ssl', '_localhost', '_noheaders', '_opener', '_parse_proxy', '_proxy_bypass_macosx_sysconf', '_randombytes', '_safe_gethostbyname', '_thishost', '_url_tempfiles', 'addclosehook', 'addinfourl', 'base64', 'bisect', 'build_opener', 'collections', 'contextlib', 'email', 'ftpcache', 'ftperrors', 'ftpwrapper', 'getproxies', 'getproxies_environment', 'getproxies_registry', 'hashlib', 'http', 'install_opener', 'io', 'localhost', 'noheaders', 'os', 'parse_http_list', 'parse_keqv_list', 'pathname2url', 'posixpath', 'proxy_bypass', 'proxy_bypass_environment', 'proxy_bypass_registry', 'quote', 're', 'request_host', 'socket', 'splitattr', 'splithost', 'splitpasswd', 'splitport', 'splitquery', 'splittag', 'splittype', 'splituser', 'splitvalue', 'ssl', 'sys', 'tempfile', 'thishost', 'time', 'to_bytes', 'unquote', 'unwrap', 'url2pathname', 'urlcleanup', 'urljoin', 'urlopen', 'urlparse', 'urlretrieve', 'urlsplit', 'urlunparse', 'warnings']
>>> 

As you can see, plenty of functions are available. The one we need is urlopen, so let's write the first crawler program.

Fetching a page's HTML data

import urllib.request  # the request submodule must be imported explicitly

response = urllib.request.urlopen('http://www.baidu.com')
html = response.read()  # returns the raw page content as bytes
print(html)
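
Note that read() gives you bytes, not str; to treat the page as text you normally decode it first. A minimal sketch, assuming the page is UTF-8 encoded (a real page may declare a different charset in its headers or meta tags):

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
raw = response.read()                          # bytes
text = raw.decode('utf-8', errors='replace')   # assumption: the page uses UTF-8
print(text[:200])                              # first 200 characters of the HTML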

 

This afternoon I spent a bit of time implementing a form submission to the Douban login page, and it tested successfully.

import http.cookiejar
import urllib.request as request
import urllib.parse as parse
from bs4 import BeautifulSoup
import re

# Save cookies so that other pages can be visited after logging in
cj = http.cookiejar.LWPCookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cj))
request.install_opener(opener)

# Fetch the login page and locate the captcha image
login_path = 'http://www.douban.com/accounts/login'
login_page = request.urlopen(login_path)
soup_login = BeautifulSoup(login_page, 'html.parser')
image = soup_login.find(id='captcha_image')
image_url = image['src']
print(image_url)
input()  # pause here so the captcha image at image_url can be opened and checked

data = {
    "form_email": "525***@qq.com",
    "form_password": "******"
    }

# Pull the captcha token out of the image URL: the part between '=' and ':'
captcha = re.findall(r'=\w+:', image_url)
captcha = ''.join(captcha)
captcha = captcha[1:len(captcha) - 1]
print(captcha)

# Submit the form
data['captcha-solution'] = captcha
post_data = parse.urlencode(data).encode("utf-8")
res = request.urlopen(login_path, post_data)
print(res.status, res.reason)
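
Because install_opener() registered the cookie-aware opener globally, any later request.urlopen() call will send the saved login cookies automatically. A minimal sketch of reusing the session, assuming the login above succeeded (the target URL is only an illustration of a page that needs a logged-in session):

# the cookies stored in cj are attached automatically by the installed opener
mine = request.urlopen('http://www.douban.com/mine/')   # hypothetical logged-in page
print(mine.status, mine.reason)
print(mine.read()[:200])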

Handling the URLError and HTTPError exceptions

import urllib.request as request
import urllib.error as error

url = 'http://www.baidu.com'
flag = 1
try:
    req = request.urlopen(url)
except (error.URLError, error.HTTPError) as e:
    # HTTPError is a subclass of URLError, so this clause catches
    # HTTP status errors as well as network-level failures
    print('URLError')
    flag = 0
if flag == 1:
    print("Successful connection", "status:" + str(req.status), req.reason)
else:
    print("Connection Failed")

geturl() returns the real URL that was fetched

import urllib.request as request

old_url = 'http://www.baidu.com'
req = request.Request(old_url)
response = request.urlopen(req)
print('old_url', old_url)
print('real_url', response.geturl())   # the URL actually retrieved, after any redirects
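
geturl() matters mostly when the request gets redirected, because then the two values differ. A quick sketch, assuming the short form of the address redirects to www.baidu.com as it commonly does:

import urllib.request as request

old_url = 'http://baidu.com'             # assumption: redirects to www.baidu.com
response = request.urlopen(old_url)
print('old_url ', old_url)
print('real_url', response.geturl())     # shows the address after the redirect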

info() returns a dictionary-like object with the response headers describing the fetched page

import urllib.request as request

old_url = 'http://www.baidu.com'
req = request.Request(old_url)
response = request.urlopen(req)
print('info', response.info())   # response headers: server, content type, date, ...
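
Because the object returned by info() behaves like a mapping of header names to values, individual headers can be read with get(). A minimal sketch:

import urllib.request as request

response = request.urlopen('http://www.baidu.com')
headers = response.info()                 # an http.client.HTTPMessage
print(headers.get('Content-Type'))        # e.g. text/html;charset=utf-8
print(headers.get('Server'))              # server software, if the site reports it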

Printing Baidu's cookies

import urllib.request as request
import http.cookiejar

cookie = http.cookiejar.LWPCookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cookie))
url = 'http://www.baidu.com'
response = opener.open(url)   # opening the URL fills the cookie jar
for item in cookie:
    print('name=', item.name)
    print('value=', item.value)
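
LWPCookieJar can also persist the cookies to disk so a later run can load them again. A small sketch; the filename is arbitrary:

import http.cookiejar
import urllib.request as request

cookie = http.cookiejar.LWPCookieJar('cookies.txt')    # arbitrary filename
opener = request.build_opener(request.HTTPCookieProcessor(cookie))
opener.open('http://www.baidu.com')
# ignore_discard/ignore_expires also keep session cookies that would otherwise be dropped
cookie.save(ignore_discard=True, ignore_expires=True)
# later: cookie.load(ignore_discard=True, ignore_expires=True)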


 
