I. GET requests with the requests module
1. Example without parameters
import requests
ret = requests.get('http://www.autohome.com.cn/news/')
ret.encoding = 'gbk'  # switch to GBK so the Chinese page decodes correctly
print(ret.url)   # print the request URL
print(ret.text)  # print the response body
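An optional sketch against the same URL: instead of hard-coding 'gbk', requests can guess the encoding from the response body via apparent_encoding, and the response object also exposes the status code and headers.
import requests

ret = requests.get('http://www.autohome.com.cn/news/')
ret.encoding = ret.apparent_encoding        # let requests guess the encoding instead of hard-coding 'gbk'
print(ret.status_code)                      # HTTP status code
print(ret.headers.get('Content-Type'))      # response headers are a dict-like object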
2. Example with parameters
import requests
params_dic = {
    'hostid': '10107',
    'elementid': '23',
    'screen': '1',
    'name': '10.28.142.240'
}
ret = requests.get(url='http://zabbix.test.com/screens.php', params=params_dic)
print(ret.url)
print(ret.text)
II. POST requests with the requests module
1. Basic POST usage
import requests
data = {
    'hostid': '10107',
    'elementid': '23',
    'screen': '1',
    'name': '10.28.142.240'
}
ret = requests.post('http://zabbix.test.com/screens.php', data=data)
print(ret.text)
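A quick sketch of data= versus json= (the public echo service httpbin.org is used purely for illustration): data= sends a form-encoded body, while json= serializes the dict to JSON and sets the Content-Type header accordingly.
import requests

r1 = requests.post('http://httpbin.org/post', data={'a': '1'})  # form-encoded body
r2 = requests.post('http://httpbin.org/post', json={'a': 1})    # JSON body
print(r1.request.headers['Content-Type'])  # application/x-www-form-urlencoded
print(r2.request.headers['Content-Type'])  # application/json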
2. POST with form data (Zabbix login)
import requests
from bs4 import BeautifulSoup
url = 'http://zabbix.kuaikuaidai.com:8888/index.php'
ret = requests.get(url)
soup = BeautifulSoup(ret.text, 'html.parser')
tag = soup.find(attrs={'name': 'sid'})            # hidden form field: session id
sid = tag.attrs['value']
tag1 = soup.find(attrs={'name': 'form_refresh'})  # hidden form field: form refresh counter
form_refresh = tag1.attrs['value']
data = {
    'sid': sid,
    'form_refresh': form_refresh,
    'name': 'admin',
    'password': 'kkdai123',
    'autologin': '1',
    'enter': 'Sign in'
}
headers = {'content-type': 'text/html'}
zabbix_ret = requests.post(url, data=data, headers=headers)
print(zabbix_ret.status_code)
print(zabbix_ret.text)
3. Other request methods
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)
Note: all of the methods above are built on requests.request(method, url, **kwargs).
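A minimal sketch confirming the note above (httpbin.org stands in as a test URL): the helpers just fill in the HTTP verb and delegate to requests.request.
import requests

r1 = requests.get('http://httpbin.org/get', params={'k': 'v'})             # helper form
r2 = requests.request('GET', 'http://httpbin.org/get', params={'k': 'v'})  # equivalent underlying call
print(r1.url == r2.url)  # True: both build the same request URL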
III. Parsing HTML documents with the BeautifulSoup module
import requests
from bs4 import BeautifulSoup

# fetch the Autohome news page
auto_home = requests.get(url='http://www.autohome.com.cn/news/')
auto_home.encoding = 'gbk'
auto_home_news = auto_home.text
soup = BeautifulSoup(auto_home_news, 'html.parser')  # returns the parsed document tree
tag = soup.find(name='h3')  # find the first h3 tag
print(tag.name)  # print the tag name
1. name, the tag's name
tag = soup.find(name='h3')
name = tag.name
tag.name = 'h4'  # set the tag name
2. attrs, the tag's attributes
print(tag.attrs)  # get all attributes of the tag
tag.attrs['id'] = '123'  # set a tag attribute
3. children, all direct child nodes
tag = soup.find(name='body')
v = tag.children
print(v)
4. descendants, all descendant nodes
tag = soup.find(name='body')
v = tag.descendants
print(v)
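A self-contained sketch (a tiny inline fragment rather than the Autohome page) showing the difference: children yields only direct child nodes, while descendants walks the entire subtree.
from bs4 import BeautifulSoup
from bs4.element import Tag

soup = BeautifulSoup('<body><div><a>link</a></div><p>text</p></body>', 'html.parser')
body = soup.find(name='body')
print([c.name for c in body.children if isinstance(c, Tag)])     # ['div', 'p']
print([d.name for d in body.descendants if isinstance(d, Tag)])  # ['div', 'a', 'p']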
5. clear, empty the tag of all its descendants
tag = soup.find(name='body')
tag.clear()
print(tag)
6. decompose, recursively remove and destroy the tag and everything inside it
tag = soup.find(name='body')
tag.decompose()
print(tag)
7. extract, remove the tag from the tree and return it
tag = soup.find(name='body')
v = tag.extract()
print(v)
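To contrast the three removal methods above, a small sketch on a throwaway fragment: clear empties the tag but keeps it, decompose destroys the tag and its contents, extract removes the tag but hands it back.
from bs4 import BeautifulSoup

html = '<div><span>hi</span></div>'
soup = BeautifulSoup(html, 'html.parser')
soup.find('span').clear()            # <div><span></span></div>: tag kept, contents gone
print(soup)
soup = BeautifulSoup(html, 'html.parser')
soup.find('span').decompose()        # <div></div>: tag and contents destroyed
print(soup)
soup = BeautifulSoup(html, 'html.parser')
span = soup.find('span').extract()   # <div></div>, and the removed tag is returned for reuse
print(soup, span)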
8. decode, serialize to a string (including the current tag); decode_contents (excluding the current tag)
tag = soup.find(name='body')
v = tag.decode()
v1 = tag.decode_contents()
print(v)
9. encode, serialize to bytes (including the current tag); encode_contents (excluding the current tag)
tag = soup.find(name='body')
v = tag.encode()
v1 = tag.encode_contents()
print(v)
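A quick sketch of all four serializers on an inline fragment:
from bs4 import BeautifulSoup

div = BeautifulSoup('<div><b>x</b></div>', 'html.parser').find('div')
print(div.decode())           # '<div><b>x</b></div>' (str, tag included)
print(div.decode_contents())  # '<b>x</b>' (str, children only)
print(div.encode())           # b'<div><b>x</b></div>' (bytes, tag included)
print(div.encode_contents())  # b'<b>x</b>' (bytes, children only)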
10. find, get the first matching tag
tag = soup.find(name='body')
11. find_all, get all matching tags
tag = soup.find_all(name='h3')
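find and find_all accept more filters than just the tag name; a sketch of the common ones on a throwaway fragment:
from bs4 import BeautifulSoup

soup = BeautifulSoup('<a id="x">1</a><a>2</a><b>3</b>', 'html.parser')
print(soup.find_all('a', limit=1))       # stop after the first match
print(soup.find_all(attrs={'id': 'x'}))  # filter by attributes
print(soup.find_all(['a', 'b']))         # match any name in a list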
12. has_attr, check whether the tag has a given attribute
tag = soup.find(name='h3')
v = tag.has_attr('id')
print(v)
13. get_text, get the tag's text content
tag = soup.find(name='h3')
v = tag.get_text()
print(v)
14. is_empty_element, whether the tag is an empty or self-closing tag (such as <br/>)
# tag = soup.find('br')
# v = tag.is_empty_element
# print(v)
15. Navigation attributes for related tags (a runnable sketch follows the list)
# soup.next
# soup.next_element
# soup.next_elements
# soup.next_sibling
# soup.next_siblings
#
# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings
#
# tag.parent
# tag.parents
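A small sketch of these navigation attributes on a tiny inline fragment:
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div><p>one</p><p>two</p></div>', 'html.parser')
first_p = soup.find('p')
print(first_p.next_sibling)      # <p>two</p> (no whitespace between them here)
print(first_p.previous_sibling)  # None: nothing before the first <p>
print(first_p.parent.name)       # div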
16. Search methods for related tags (a sketch follows the list)
# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)
# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)
# tag.find_parent(...)
# tag.find_parents(...)
# these take the same parameters as find_all
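A matching sketch for two of these search variants, again on a throwaway fragment:
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div><p id="a">one</p><p id="b">two</p></div>', 'html.parser')
tag = soup.find('p', attrs={'id': 'a'})
print(tag.find_next_sibling('p'))   # <p id="b">two</p>
print(tag.find_parent('div').name)  # div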
17. string, get or set the tag's content
tag = soup.find(name='h3')
tag.string = 'new content'  # set the tag's content
v = tag.string
print(v)
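One caveat worth showing: .string returns None when a tag has more than one child node; get_text collects all nested text instead.
from bs4 import BeautifulSoup

p = BeautifulSoup('<p>one<b>two</b></p>', 'html.parser').find('p')
print(p.string)    # None: the tag has more than one child node
print(p.get_text())  # 'onetwo': joins all nested text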
18. append, append a tag at the end of the current tag's contents
# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
#
# from bs4.element import Tag
# obj = Tag(name='i',attrs={'id': 'it'})
# obj.string = 'I am new here'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)
19. insert, insert a tag at a given position inside the current tag
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = 'I am new here'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)
Logging in to Chouti (dig.chouti.com) and upvoting a post
Method 1: Chouti ties authorization to the gpsd cookie issued on the first GET, so the vote request must carry that original cookie rather than the one returned by the login POST.
import requests
i1 = requests.get(url='http://dig.chouti.com/')
i1_cookies = i1.cookies.get_dict()
#print(i1_cookies)
i2 = requests.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': '86xxxxxxxxx',
        'password': 'xxxxxxx',
        'oneMonth': "",
    },
    cookies=i1_cookies
)
gpsd = i1_cookies['gpsd']
i3 = requests.post(
    url="http://dig.chouti.com/link/vote?linksId=14723416",
    cookies={'gpsd': gpsd}
)
print(i3.text)
Method 2: a Session keeps cookies across requests automatically, so no manual cookie handling is needed.
import requests
headers = {
    "Host": "dig.chouti.com",
    "Referer": "http://dig.chouti.com/",
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'X-Requested-With': 'XMLHttpRequest'
}
session = requests.Session()
i1 = session.get(url="http://dig.chouti.com/", headers=headers)
i2 = session.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': '86xxxxxxxxxxx',
        'password': 'xxxxxxx',
        'oneMonth': ""
    },
    headers=headers
)
i3 = session.post(
    url="http://dig.chouti.com/link/vote?linksId=14723416",
    headers=headers
)
print(i3.text)
Logging in to a GitHub account and listing its repositories
Method 1
import requests
from bs4 import BeautifulSoup
i1 = requests.get('https://github.com/login')
soup1 = BeautifulSoup(i1.text, features='lxml')
tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})  # hidden CSRF token field
authenticity_token = tag.get('value')
c1 = i1.cookies.get_dict()
i1.close()
form_data = {
    "authenticity_token": authenticity_token,
    'utf8': "",
    'commit': "Sign in",
    'login': '758109577@qq.com',
    'password': 'xxxxxxxx'
}
i2 = requests.post('https://github.com/session', data=form_data, cookies=c1)
c2 = i2.cookies.get_dict()
c1.update(c2)
i3 = requests.get('https://github.com/settings/repositories', cookies=c1)
soup3 = BeautifulSoup(i3.text, features='lxml')
list_group = soup3.find(name='div', class_='listgroup')
from bs4.element import Tag
for child in list_group.children:
    if isinstance(child, Tag):
        project_tag = child.find(name='a', class_='mr-1')
        size_tag = child.find(name='small')
        temp = "project: %s (%s); path: %s" % (project_tag.string, size_tag.string, project_tag.get('href'))
        print(temp)
Method 2
import requests
from bs4 import BeautifulSoup
session = requests.Session()
i1 = session.get('https://github.com/login')
soup1 = BeautifulSoup(i1.text, features='lxml')
tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})  # hidden CSRF token field
authenticity_token = tag.get('value')
c1 = i1.cookies.get_dict()
i1.close()
form_data = {
    "authenticity_token": authenticity_token,
    'utf8': "",
    'commit': "Sign in",
    'login': '758109577@qq.com',
    'password': 'xxxxxxxxxx'
}
i2 = session.post('https://github.com/session', data=form_data)
c2 = i2.cookies.get_dict()
c1.update(c2)
i3 = session.get('https://github.com/settings/repositories')
soup3 = BeautifulSoup(i3.text, features='lxml')
list_group = soup3.find(name='div', class_='listgroup')
from bs4.element import Tag
for child in list_group.children:
    if isinstance(child, Tag):
        project_tag = child.find(name='a', class_='mr-1')
        size_tag = child.find(name='small')
        temp = "project: %s (%s); path: %s" % (project_tag.string, size_tag.string, project_tag.get('href'))
        print(temp)
Zhihu login
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time
import requests
from bs4 import BeautifulSoup
session = requests.Session()
i1 = session.get(
    url='https://www.zhihu.com/#signin',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)
soup1 = BeautifulSoup(i1.text, 'lxml')
xsrf_tag = soup1.find(name='input', attrs={'name': '_xsrf'})  # hidden CSRF token field
xsrf = xsrf_tag.get('value')
current_time = time.time()
i2 = session.get(
    url='https://www.zhihu.com/captcha.gif',
    params={'r': current_time, 'type': 'login'},
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    })
with open('zhihu.gif', 'wb') as f:
    f.write(i2.content)
captcha = input('Open zhihu.gif, view the image, and enter the captcha: ')
form_data = {
    "_xsrf": xsrf,
    'password': 'xxxxxx',
    "captcha": captcha,
    'email': '7xxxxx@qq.com'
}
i3 = session.post(
    url='https://www.zhihu.com/login/email',
    data=form_data,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)
i4 = session.get(
    url='https://www.zhihu.com/settings/profile',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)
soup4 = BeautifulSoup(i4.text, 'lxml')
tag = soup4.find(id='rename-section')
nick_name = tag.find('span', class_='name').string
print(nick_name)
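Since every call above repeats the same User-Agent header, note that a Session can also carry default headers; a minimal sketch:
import requests

session = requests.Session()
session.headers.update({  # set once; sent with every request made through this session
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
})
# now session.get('https://www.zhihu.com/#signin') needs no headers= argument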
Reposted from: https://blog.51cto.com/haoyonghui/1972754