Python Web Scraping (Part 2)

requests
Warning: unprofessional use of other HTTP libraries can cause dangerous side effects, including: security-deficiency syndrome, redundant-code syndrome, reinventing-the-wheel syndrome, documentation-chewing syndrome, and more.
Requests lets you send plain, natural HTTP/1.1 requests: you don't need to add query strings to URLs by hand, nor form-encode POST data yourself. Keep-alive and HTTP connection pooling are 100% automatic, powered by urllib3 embedded inside Requests.

  • Sending requests
import requests
url = 'http://www.baidu.com'
r = requests.get(url)
print(r.status_code)
print(r.text)
import requests
url = 'http://httpbin.org/get'
params = {'k1':'v1', 'k2':'v2'}
r = requests.get(url, params=params)
print(r.url)

Output:
http://httpbin.org/get?k1=v1&k2=v2

If the value of 'k2' is a list, e.g.:
params = {'k1':'v1', 'k2':[1,2,3]}
then the output is:
http://httpbin.org/get?k1=v1&k2=1&k2=2&k2=3

If a value is None, that parameter is left out of the query string, e.g.:
params = {'k1':'v1', 'k2':None}
gives:
http://httpbin.org/get?k1=v1
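A runnable sketch combining the two cases above (a list value and a None value) against httpbin.org:

import requests

# list values expand into repeated query parameters; None values are dropped entirely
params = {'k1': 'v1', 'k2': [1, 2, 3], 'k3': None}
r = requests.get('http://httpbin.org/get', params=params)
print(r.url)  # http://httpbin.org/get?k1=v1&k2=1&k2=2&k2=3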
  • Handling binary data
import requests
from io import BytesIO
from PIL import Image
r = requests.get('https://ss1.bdstatic.com/70cFvXSh_Q1YnxGkpoWK1HF6hhy/it/u=3429926207,375734078&fm=117&gp=0.jpg', stream = True)
image = Image.open(BytesIO(r.content))
image.save('meinv.jpg')
  • Handling JSON data
import requests
r = requests.get('https://github.com/timeline.json')
print(type(r.json()))  # r.json() parses the JSON body; note the call parentheses
print(r.text)
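The GitHub timeline.json endpoint has since been retired, so as a working alternative, a small sketch against httpbin.org (already used above), which always returns JSON:

import requests

r = requests.get('http://httpbin.org/get')
data = r.json()       # parses the response body into Python objects
print(type(data))     # <class 'dict'>
print(data['url'])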
  • Handling raw data
import requests
r = requests.get('https://ss1.bdstatic.com/70cFvXSh_Q1YnxGkpoWK1HF6hhy/it/u=3429926207,375734078&fm=117&gp=0.jpg', stream = True)
with open('meinv3.jpg', 'wb+') as f:
    for chunk in r.iter_content(1024):
        f.write(chunk)
  • Submitting forms
import requests
form = {'username':'user', 'password':'pass'}
r = requests.post('http://httpbin.org/post', data = form)
print(r.text)


Or, to send the body as a JSON string instead (this also requires import json):
r = requests.post('http://httpbin.org/post', data = json.dumps(form))
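requests can also serialize the dictionary itself through the json= keyword, which additionally sets the Content-Type: application/json header; a minimal sketch:

import requests

form = {'username': 'user', 'password': 'pass'}
r = requests.post('http://httpbin.org/post', json=form)  # sent as a JSON body
print(r.json()['json'])  # httpbin echoes the parsed JSON body back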
  • Cookies
import requests
url = 'http://www.baidu.com'
r = requests.get(url)
cookies = r.cookies
for k,v in cookies.get_dict().items():
    print(k,v)
Output:
BDORZ 27315
import requests
cookies = {'c1':'v1', 'c2':'v2'}
r = requests.get('http://httpbin.org/cookies', cookies = cookies)
print(r.text)

Output:
{
  "cookies": {
    "c1": "v1", 
    "c2": "v2"
  }
}
  • Redirects and redirect history
    Many sites that are reached over http now redirect you to their https counterpart.
import requests
r = requests.head('http://github.com', allow_redirects = True)
print(r.url)
print(r.status_code)
print(r.history)

Output:
https://github.com/
200
[<Response [301]>]
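To inspect the redirect itself without following it, allow_redirects can be set to False; a small sketch:

import requests

r = requests.head('http://github.com', allow_redirects=False)
print(r.status_code)          # 301
print(r.headers['Location'])  # https://github.com/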
  • Proxies
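A minimal sketch of sending a request through a proxy with the proxies parameter (the proxy address below is a placeholder; replace it with a real proxy):

import requests

# map each URL scheme to a proxy address -- placeholders, not a real proxy
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}
r = requests.get('http://httpbin.org/ip', proxies=proxies)
print(r.text)  # httpbin reports the client IP it saw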

Beautiful Soup (DOM-style processing)
A Python library for extracting data from HTML or XML files. It lets you navigate, search, and modify the document in the way you prefer, through the parser of your choice.
The legacy versions are no longer developed; the current release line is 4.x (4.2 at the time of writing), so install version 4 or above (pip install beautifulsoup4).
  • Pretty-printing and browsing the data

test.html:

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>

Python code:

from bs4 import BeautifulSoup
soup = BeautifulSoup(open('test.html'), 'html.parser')  # specify the parser explicitly to avoid a warning
print(soup.prettify())

Output:
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
  • Accessing tags
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('test.html'), 'html.parser')
print(type(soup.title))
print(soup.title.name)
print(soup.title)
print(soup.title.string)  # String


Output:
<class 'bs4.element.Tag'>
title
<title>The Dormouse's story</title>
The Dormouse's story

String & Comment

print(type(soup.title.string))
print(soup.title.string)
print(type(soup.a.string))
print(soup.a.string)


<class 'bs4.element.NavigableString'>
The Dormouse's story
<class 'bs4.element.Comment'>
 Elsie 
  • Accessing attributes
  • Getting text
  • Handling comments
    (a combined sketch for these three items follows the search example below)
  • Searching
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('test.html'), 'html.parser')
for item in soup.body.contents:
    print(item.name)

Output:
None
p
None
p
None
p
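A combined sketch for accessing attributes, getting text, and handling comments, reusing the same test.html (find_all is included since it is the usual search entry point):

from bs4 import BeautifulSoup
from bs4.element import Comment

soup = BeautifulSoup(open('test.html'), 'html.parser')

# accessing attributes: a Tag behaves like a dict of its attributes
link = soup.find('a', id='link2')
print(link['href'])        # http://example.com/lacie
print(link.get('class'))   # ['sister']

# getting text: get_text() concatenates all the text inside a tag
print(soup.find('p', class_='story').get_text())

# handling comments: Comment is a subclass of NavigableString,
# so check the type before treating .string as real text
for a in soup.find_all('a'):
    if isinstance(a.string, Comment):
        print('comment:', a.string)
    else:
        print('text:', a.string)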
  • CSS selectors
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('test.html'), 'html.parser')
print(soup.select('.sister'))
print(soup.select('#link1'))
print(soup.select('head > title'))

Output:
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]


[<title>The Dormouse's story</title>]

HTMLParser (SAX-style processing)
It relies on the markupbase package, which cannot be installed successfully with pip.
First run pip search markupbase, then download it from the web; after downloading, rename the file "_markupbase.py" inside it to "markupbase.py" and place it in Lib/site-packages under the Python installation directory.
(This workaround only concerns Python 2; in Python 3 the parser lives in the standard library as html.parser, and _markupbase ships with it.)

from html.parser import HTMLParser  # Python 3; under Python 2 this was "from HTMLParser import HTMLParser"
class MyParser(HTMLParser):
    def handle_decl(self, decl):
        HTMLParser.handle_decl(self, decl)
        print('decl %s'%decl)

    def handle_starttag(self, tag, attrs):
        HTMLParser.handle_starttag(self, tag, attrs)
        print('<' + tag + '>')

    def handle_endtag(self, tag):
        HTMLParser.handle_endtag(self, tag)
        print('</' + tag + '>')

    def handle_data(self, data):
        HTMLParser.handle_data(self, data)
        print('data %s'%data)


    def handle_startendtag(self, tag, attrs):
        HTMLParser.handle_startendtag(self, tag, attrs)

    def handle_comment(self, data):
        HTMLParser.handle_comment(self, data)
        print('data %s'%data)
    def close(self):
        HTMLParser.close(self)
        print('Close')

demo = MyParser()
demo.feed(open('sample.html').read())
demo.close()

Output:


<html>
data  

<head>
data 

<meta>
</meta>
data 

<title>
data Rollen Holt - cnblogs
</title>
data 

<meta>
</meta>
data 

<link>
</link>
data 

<link>
</link>
data 

<link>
</link>
data 

<link>
</link>
data 

<link>
</link>
data 

<link>
</link>
data 

<link>
</link>
data 

<link>
</link>
data  

<script>
</script>
data   

<script>
</script>
data 

<script>
</script>
data 

<script>
</script>
data 

<script>
</script>
data 

</head>
data 

<body>
data 

<a>
</a>
data 

<form>
data 

<div>
data 

<input>
</input>
data 

</div>
data 

</form>
data 

</body>
data 

</html>
Close

sqlite

import sqlite3
conn = sqlite3.connect('test.db')
create_sql = 'create table company(id int primary key not null, emp_name text not null)'
conn.execute(create_sql)

insert_sql = 'insert into company values(?, ?)'  # parameterized query to prevent SQL injection; concatenating strings is injection-prone
conn.execute(insert_sql, (100, 'LY'))
conn.execute(insert_sql, (200, 'July'))
cursors = conn.execute('select id, emp_name from company')
for row in cursors:
    print(row[0], row[1])
conn.commit()  # persist the inserts; without commit they are discarded when the connection closes
conn.close()

Output:

100 LY
200 July

Note:
MySQL additionally needs connection parameters: host (ip/port), username, password.
After doing inserts with MySQL, be sure to call conn.commit() after the insert statements, otherwise they have no effect.
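A minimal sketch of the same insert/commit flow against MySQL, assuming the pymysql driver and an existing company table; the host, user, password, and database below are placeholders:

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306,
                       user='root', password='xxx', database='test')
try:
    with conn.cursor() as cur:
        cur.execute('insert into company values(%s, %s)', (300, 'Tom'))
    conn.commit()  # without commit() the insert has no effect
finally:
    conn.close()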

Hands-on: log in to Douban and scrape page content

import requests
import html5lib
import re
from bs4 import BeautifulSoup

s = requests.Session()
url_login = 'http://accounts.douban.com/login'
url_contacts = 'https://www.douban.com/people/****/contacts'

formdata = {
    'redir': 'https://www.douban.com',
    'form_email': 'xxx',  # replace with your own email
    'form_password': 'xxx',  # replace with your own password
    'login':u'登录'
}

headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
}

r = s.post(url_login, data = formdata, headers = headers)

content = r.text
soup = BeautifulSoup(content, 'html5lib')
captcha = soup.find('img', id = 'captcha_image')
if captcha:
    captcha_url = captcha['src']
    re_captcha_id = r'<input type="hidden" name="captcha-id" value="(.*?)"/'
    captcha_id = re.findall(re_captcha_id, content)
    print(captcha_id)
    print(captcha_url)
    captcha_text = input('Please input the captcha: ')
    formdata['captcha-solution'] = captcha_text
    formdata['captcha-id'] = captcha_id
    r = s.post(url_login, data = formdata, headers = headers)

# r = s.get(url_contacts)
with open('contacts.txt', 'w+', encoding = 'utf-8') as f:
    f.write(r.text)

Hands-on: scrape the titles and other info of the Douban Movie Top 250

import requests
from lxml import etree

s = requests.Session()
for start in range(0, 250, 25):  # the Top 250 spans 10 pages: start = 0, 25, ..., 225
    url = 'https://movie.douban.com/top250/?start=' + str(start)
    r = s.get(url)
    r.encoding = 'utf-8'
    root = etree.HTML(r.content)
    items = root.xpath('//ol/li/div[@class="item"]')
    print(len(items))

    for item in items:
        title = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
        name = title[0].encode('gb2312', 'ignore').decode('gb2312')  # drop characters a GB2312 console cannot display
        rank = item.xpath('./div[@class="pic"]/em/text()')[0]
        rating = item.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
        print(name,rank, rating)

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
cookies = {'xxx': 'xxx'}  # replace with your own cookies as name: value pairs
url = 'http://www.douban.com'
r = requests.get(url, cookies=cookies, headers=headers)
with open('douban_2.txt', 'wb+') as f:
    f.write(r.content)
