requests
Warning: non-professional use of other HTTP libraries can lead to dangerous side effects, including: security vulnerabilities, verbose code, reinventing the wheel, constantly chewing through documentation, and so on.
Requests lets you send natural, plain HTTP/1.1 requests, with no need to manually add query strings to your URLs or to form-encode your POST data. Keep-alive and HTTP connection pooling are 100% automatic, powered by urllib3 embedded inside Requests.
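To actually reuse connections (keep-alive and pooling) across several requests, one common way is to issue them through a single requests.Session. A minimal sketch (httpbin.org is used here purely as a convenient test endpoint):
import requests
# a Session reuses the underlying TCP connection (keep-alive) across requests
s = requests.Session()
for i in range(3):
    r = s.get('http://httpbin.org/get', params={'i': i})
    print(r.status_code, r.url)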
- Sending a request
import requests
url = 'http://www.baidu.com'
r = requests.get(url)
print(r.status_code)
print(r.text)
- Passing parameters
e.g. http://xxx?aa=bb&cc=dd
import requests
url = 'http://httpbin.org/get'
params = {'k1':'v1', 'k2':'v2'}
r = requests.get(url, params=params)
print(r.url)
Output:
http://httpbin.org/get?k1=v1&k2=v2
If 'k2' maps to a list instead:
i.e.:
params = {'k1':'v1', 'k2':[1,2,3]}
then the output becomes:
http://httpbin.org/get?k1=v1&k2=1&k2=2&k2=3
If a value is None, that key is dropped from the URL, i.e.:
params = {'k1':'v1', 'k2':None}
http://httpbin.org/get?k1=v1
- Handling binary data
import requests
from io import BytesIO
from PIL import Image
r = requests.get('https://ss1.bdstatic.com/70cFvXSh_Q1YnxGkpoWK1HF6hhy/it/u=3429926207,375734078&fm=117&gp=0.jpg', stream = True)
image = Image.open(BytesIO(r.content))
image.save('meinv.jpg')
- Handling JSON data
import requests
r = requests.get('https://github.com/timeline.json')
print(type(r.json()))  # r.json() parses the response body as JSON
print(r.text)
- Handling raw (streamed) data
import requests
r = requests.get('https://ss1.bdstatic.com/70cFvXSh_Q1YnxGkpoWK1HF6hhy/it/u=3429926207,375734078&fm=117&gp=0.jpg', stream = True)
with open('meinv3.jpg', 'wb+') as f:
    for chunk in r.iter_content(1024):
        f.write(chunk)
- Submitting a form
import requests
form = {'username':'user', 'password':'pass'}
r = requests.post('http://httpbin.org/post', data = form)
print(r.text)
Or, to post a raw JSON string instead (requires import json):
r = requests.post('http://httpbin.org/post', data=json.dumps(form))
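The two variants produce different request bodies: data=form is sent form-encoded, while json.dumps(form) posts a raw JSON string. Newer versions of Requests also accept a json= keyword that serializes the dict and sets the Content-Type header for you; a minimal sketch against httpbin:
import requests
form = {'username': 'user', 'password': 'pass'}
r1 = requests.post('http://httpbin.org/post', data=form)   # form-encoded body
r2 = requests.post('http://httpbin.org/post', json=form)   # JSON body
print(r1.json()['form'])   # httpbin echoes form fields under "form"
print(r2.json()['json'])   # and a JSON body under "json"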
- cookie
url = 'http://www.baidu.com'
import requests
r = requests.get(url)
cookies = r.cookies
for k,v in cookies.get_dict().items():
print(k,v)
Output:
BDORZ 27315
import requests
cookies = {'c1':'v1', 'c2':'v2'}
r = requests.get('http://httpbin.org/cookies', cookies = cookies)
print(r.text)
Output:
{
  "cookies": {
    "c1": "v1",
    "c2": "v2"
  }
}
- Redirects and redirect history
Many sites now redirect plain http requests to their https counterparts.
import requests
r = requests.head('http://github.com', allow_redirects = True)
print(r.url)
print(r.status_code)
print(r.history)
Output:
https://github.com/
200
[<Response [301]>]
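To inspect the redirect itself instead of following it, allow_redirects can be switched off; a small sketch:
import requests
# do not follow the redirect: we receive the 301 response itself
r = requests.head('http://github.com', allow_redirects=False)
print(r.status_code)           # 301
print(r.headers['Location'])   # https://github.com/
print(r.history)               # [] - nothing was followed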
- Proxies
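The original notes leave this section empty; below is a minimal sketch of routing requests through an HTTP proxy (the proxy address is a placeholder and must be replaced with one you can actually use):
import requests
proxies = {
    'http': 'http://127.0.0.1:8080',   # placeholder proxy address
    'https': 'http://127.0.0.1:8080',
}
r = requests.get('http://httpbin.org/ip', proxies=proxies)
print(r.text)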
Beautiful Soup (DOM-style processing)
Beautiful Soup is a Python library for pulling data out of HTML and XML files. It works with your parser of choice to provide idiomatic ways of navigating, searching, and modifying the parse tree.
The legacy version is no longer under development; the current version is 4.2 (so install version 4 or above, i.e. the bs4 package).
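The examples below call BeautifulSoup(open('test.html')) without naming a parser, which still works but makes recent bs4 versions print a warning; a minimal sketch of passing the parser explicitly ('lxml' and 'html5lib' are optional third-party alternatives to the built-in 'html.parser'):
from bs4 import BeautifulSoup
with open('test.html') as f:
    soup = BeautifulSoup(f, 'html.parser')  # built-in parser; 'lxml'/'html5lib' also work if installed
print(soup.title.string)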
- Browsing the data after pretty-printing
test.html:
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
Python code:
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('test.html'))
print(soup.prettify())
Output:
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
   and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
- Accessing a Tag
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('test.html'))
print(type(soup.title))
print(soup.title.name)
print(soup.title)
print(soup.title.string) # String
Output:
<class 'bs4.element.Tag'>
title
<title>The Dormouse's story</title>
The Dormouse's story
String & Comment
print(type(soup.title.string))
print(soup.title.string)
print(type(soup.a.string))
print(soup.a.string)
<class 'bs4.element.NavigableString'>
The Dormouse's story
<class 'bs4.element.Comment'>
Elsie
- Accessing attributes
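The original notes leave this section empty; a minimal sketch of reading tag attributes from the same test.html:
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('test.html'), 'html.parser')
a = soup.a
print(a['href'])       # http://example.com/elsie
print(a.get('class'))  # ['sister']
print(a.attrs)         # all attributes as a dict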
- Getting text
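Also left empty in the original; a small sketch using get_text() to extract the text of a tag or of the whole document:
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('test.html'), 'html.parser')
print(soup.p.get_text())   # text of the first <p>
print(soup.get_text())     # all text in the document
for s in soup.p.strings:   # iterate over the individual strings
    print(repr(s))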
- Handling comments
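The Elsie link in test.html contains an HTML comment rather than text, and .string returns it as a bs4.element.Comment (see the String & Comment output above); a minimal sketch of detecting and skipping comments:
from bs4 import BeautifulSoup
from bs4.element import Comment
soup = BeautifulSoup(open('test.html'), 'html.parser')
for a in soup.find_all('a'):
    s = a.string
    if isinstance(s, Comment):
        print('comment:', s)   # " Elsie "
    else:
        print('text:', s)      # Lacie, Tillie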
- Searching
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('test.html'))
for item in soup.body.contents:
print(item.name)
Output:
None
p
None
p
None
p
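The snippet above only walks soup.body.contents (the None entries are the newline strings between tags); for actual searching, find_all is the usual tool. A minimal sketch:
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('test.html'), 'html.parser')
print(soup.find_all('a'))                   # every <a> tag
print(soup.find_all('a', class_='sister'))  # filter by CSS class
print(soup.find(id='link2'))                # first match by id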
- CSS selectors
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('test.html'))
print(soup.select('.sister'))
print(soup.select('#link1'))
print(soup.select('head > title'))
Output:
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
[<title>The Dormouse's story</title>]
HTMLParser (SAX-style processing)
It needs the markupbase package, which cannot be installed successfully with pip.
First run pip search markupbase, then download it from the web;
after downloading, rename the file "_markupbase.py" inside it to "markupbase.py" and drop it into <Python install dir>/Lib/site-packages.
(On Python 3, the standard-library parser can be used directly with from html.parser import HTMLParser, which needs no extra packages.)
from HTMLParser import HTMLParser
class MyParser(HTMLParser):
    def handle_decl(self, decl):
        HTMLParser.handle_decl(self, decl)
        print('decl %s' % decl)
    def handle_starttag(self, tag, attrs):
        HTMLParser.handle_starttag(self, tag, attrs)
        print('<' + tag + '>')
    def handle_endtag(self, tag):
        HTMLParser.handle_endtag(self, tag)
        print('</' + tag + '>')
    def handle_data(self, data):
        HTMLParser.handle_data(self, data)
        print('data %s' % data)
    def handle_startendtag(self, tag, attrs):
        HTMLParser.handle_startendtag(self, tag, attrs)
    def handle_comment(self, data):
        HTMLParser.handle_comment(self, data)
        print('comment %s' % data)
    def close(self):
        HTMLParser.close(self)
        print('Close')
demo = MyParser()
demo.feed(open('sample.html').read())
demo.close()
Output:
<html>
data
<head>
data
<meta>
</meta>
data
<title>
data Rollen Holt - cnblogs
</title>
data
<meta>
</meta>
data
<link>
</link>
data
<link>
</link>
data
<link>
</link>
data
<link>
</link>
data
<link>
</link>
data
<link>
</link>
data
<link>
</link>
data
<link>
</link>
data
<script>
</script>
data
<script>
</script>
data
<script>
</script>
data
<script>
</script>
data
<script>
</script>
data
</head>
data
<body>
data
<a>
</a>
data
<form>
data
<div>
data
<input>
</input>
data
</div>
data
</form>
data
</body>
data
</html>
Close
sqlite
import sqlite3
conn = sqlite3.connect('test.db')
create_sql = 'create table company(id int primary key not null, emp_name text not null)'
conn.execute(create_sql)
insert_sql = 'insert into company values(?, ?)'  # use ? placeholders to prevent SQL injection; concatenating strings is easily injectable
conn.execute(insert_sql, (100, 'LY'))
conn.execute(insert_sql, (200, 'July'))
conn.commit()  # persist the inserts; without commit they are lost when the connection closes
cursors = conn.execute('select id, emp_name from company')
for row in cursors:
    print(row[0], row[1])
conn.close()
Output:
100 LY
200 July
Note:
For MySQL you also need host (ip/port), username, password, etc. when connecting.
After doing inserts with MySQL, be sure to call conn.commit() after the insert statements,
otherwise they have no effect.
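A minimal sketch of the same pattern against MySQL using the pymysql package (host, user, password, and database below are placeholder assumptions, and the company table is assumed to already exist):
import pymysql
conn = pymysql.connect(host='127.0.0.1', port=3306,
                       user='root', password='xxx', database='test')
cur = conn.cursor()
cur.execute('insert into company values(%s, %s)', (300, 'Tom'))
conn.commit()  # required: without commit the insert is not persisted
cur.execute('select id, emp_name from company')
for row in cur.fetchall():
    print(row[0], row[1])
conn.close()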
Hands-on: log in to Douban and scrape page content
import requests
import html5lib
import re
from bs4 import BeautifulSoup
s = requests.Session()
url_login = 'http://accounts.douban.com/login'
url_contacts = 'https://www.douban.com/people/****/contacts'
formdata = {
    'redir': 'https://www.douban.com',
    'form_email': 'xxx',     # replace with your own email
    'form_password': 'xxx',  # replace with your own password
    'login': u'登录'
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
}
r = s.post(url_login, data = formdata, headers = headers)
content = r.text
soup = BeautifulSoup(content, 'html5lib')
captcha = soup.find('img', id = 'captcha_image')
if captcha:
    captcha_url = captcha['src']
    re_captcha_id = r'<input type="hidden" name="captcha-id" value="(.*?)"/'
    captcha_id = re.findall(re_captcha_id, content)
    print(captcha_id)
    print(captcha_url)
    captcha_text = input('Please input the captcha: ')
    formdata['captcha-solution'] = captcha_text
    formdata['captcha-id'] = captcha_id[0]  # findall returns a list; take the first match
    r = s.post(url_login, data=formdata, headers=headers)
# r = s.get(url_contacts)
with open('contacts.txt', 'w+', encoding='utf-8') as f:
    f.write(r.text)
Hands-on: scrape the titles and other info of the Douban Movie Top 250
import requests
from lxml import etree
s = requests.Session()
for start in range(0, 250, 25):  # 10 pages, 25 movies per page
    url = 'https://movie.douban.com/top250/?start=' + str(start)
    r = s.get(url)
    r.encoding = 'utf-8'
    root = etree.HTML(r.content)
    items = root.xpath('//ol/li/div[@class="item"]')
    print(len(items))
    for item in items:
        title = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
        name = title[0].encode('gb2312', 'ignore').decode('gb2312')  # drop characters the Windows console cannot display
        rank = item.xpath('./div[@class="pic"]/em/text()')[0]
        rating = item.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
        print(name, rank, rating)
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
cookies = {'xxx': 'xxx'}  # replace with your own cookie name/value pairs
url = 'http://www.douban.com'
r = requests.get(url, cookies=cookies, headers=headers)
with open('douban_2.txt', 'wb+') as f:
    f.write(r.content)