Python Web Scraping: An Introduction to Request Modules

A worked example with the urllib.request module:

The walkthrough below uses scraping Baidu Tieba as the example:

1. Import the package:

import urllib.request

2. Analyze the URL, using the "美女" (beauty) forum as the example:

# Page 1: https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=0
# Page 2: https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=50
# The common part is https://tieba.baidu.com/f?    &pn=
# pn is 0 on page 1 and 50 on page 2,
# so for page i: pn = (i - 1) * 50  (e.g. page 3 -> pn = 100)

baseurl = 'https://tieba.baidu.com/f?'

# kw=%E7%BE%8E%E5%A5%B3 is the percent-encoded form of "美女":
# each Chinese character becomes three %XX bytes, so the query can be built from a dict
name = input('Enter the forum name to search for: ')
kw = {'kw': name}
# Percent-encode the dict into a query string
# (this needs the urllib.parse module)
import urllib.parse
kw = urllib.parse.urlencode(kw)
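# A quick sanity check (assuming UTF-8, which matches the URLs above):
# urllib.parse.urlencode({'kw': '美女'}) returns 'kw=%E7%BE%8E%E5%A5%B3',
# i.e. each Chinese character is turned into three %XX bytes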

# Browser headers (the key must be spelled 'User-Agent')
headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
}

# Assemble the final URL for each page

start = int(input("Enter the start page: "))
end = int(input("Enter the end page: "))

for i in range(start, end + 1):
    pn = (i - 1) * 50
    # Build the final URL for page i
    url = baseurl + kw + '&pn=' + str(pn)

    # Send the request and get the response
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    # Decode the response body as UTF-8 text
    html = response.read().decode('utf-8')

    # Save the scraped page to a file (the directory must already exist)
    filename = r'E:\美女吧\第' + str(i) + '页.html'

    with open(filename, 'w', encoding='utf-8') as f:
        print('Scraping page %d' % i)
        f.write(html)
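
Note that open() fails if the target directory does not exist yet. A minimal sketch of creating it up front, assuming the same save directory as in the filenames above (os.makedirs is standard library):

import os

save_dir = r'E:\美女吧'   # same directory as the filenames above
# Create the directory if it is missing; exist_ok=True avoids an error when it already exists
os.makedirs(save_dir, exist_ok=True)

# Build the filename for a given page with os.path.join instead of string concatenation
page = 1
filename = os.path.join(save_dir, '第' + str(page) + '页.html')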

The complete code:

import urllib.request
import urllib.parse

class BaiduSpider():
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
        }
        self.baseurl = 'https://tieba.baidu.com/f?'

    def readPage(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')
        return html

    def writePage(self, filename, html):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def main(self):
        name = input('Enter the forum name to search for: ')
        start = int(input('Enter the start page: '))
        end = int(input('Enter the end page: '))
        kw = {'kw': name}
        kw = urllib.parse.urlencode(kw)
        for i in range(start, end + 1):
            url = self.baseurl + kw + '&pn=' + str((i - 1) * 50)
            html = self.readPage(url)
            filename = r'E:\作品\Python\爬虫-网页\贴吧-妹子\class版\妹子第' + str(i) + '页.html'
            print('Scraping page %d' % i)
            self.writePage(filename, html)

if __name__ == '__main__':
    spider = BaiduSpider()
    spider.main()
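
In practice urlopen can hang or fail on network errors, so a slightly more defensive version of readPage might add a timeout and catch URLError. A minimal sketch of that variant (the 10-second timeout is an arbitrary choice, not part of the original code):

import urllib.request
import urllib.error

def read_page_safe(url, headers, timeout=10):
    req = urllib.request.Request(url, headers=headers)
    try:
        # timeout is in seconds; urlopen raises URLError if the request fails
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except urllib.error.URLError as e:
        print('Request failed:', e)
        return None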

The requests module

The requests module is demonstrated below, again using Baidu Tieba as the example:

# Import the requests package; it is a third-party library, so install it first with `pip install requests` from the command line
import requests


# Analyze the URL
# Page 1: https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=0
# Page 2: https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=50
# The common part is https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3        &pn=
# pn is 0 on page 1 and 50 on page 2,
# so for page i: pn = (i - 1) * 50

baseurl = 'https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    'Cookie': 'BIGipServerpool_index=804258314.43286.0000; RAIL_EXPIRATION=1588587318843; RAIL_DEVICEID=ZgwnwnZO9_yBuDvgQ2tYUeoY8HArzTO0jVYH7i1HV7joCIruN9FKsxWFduaZnJcVyRi6LO4kgvPHPO9AKEUxL36_vel-lhEO-XUkLq81sctFLQJy2vKZYjEBfjaS_ClMXDsBAWPQrzzfQ7t3ZM6ltHMtGuopnE5m; route=c5c62a339e7744272a54643b3be5bf64; BIGipServerotn=233832970.50210.0000'
}

start = int(input("Enter the start page: "))
end = int(input("Enter the end page: "))
for i in range(start, end + 1):
    url = baseurl + '&pn=' + str((i - 1) * 50)
    # Send the request; .content holds the raw bytes of the response body
    response = requests.get(url, headers=headers)
    html = response.content.decode('utf-8')
    print(html)
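
For reference, requests can also build and percent-encode the query string itself through its params argument, which removes the manual urlencode step needed in the urllib version. A minimal sketch, with the forum name and page range as placeholder values:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}

name = '美女'           # placeholder forum name
for i in range(1, 3):   # pages 1-2 as an example
    # requests encodes the params dict and appends it to the URL as ?kw=...&ie=...&pn=...
    params = {'kw': name, 'ie': 'utf-8', 'pn': (i - 1) * 50}
    response = requests.get('https://tieba.baidu.com/f', params=params, headers=headers)
    # response.url shows the final encoded URL; response.text is the decoded body
    print(response.url, len(response.text))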