Python Web Scraping: An Introduction to Request Modules
A worked example with the urllib.request module:
The walkthrough below uses scraping Baidu Tieba (百度贴吧) as the example:
1. Import the package:
import urllib.request
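Before building the full crawler, a minimal GET request confirms that the module works (a sketch; http://httpbin.org/get is a public test endpoint used here only for illustration and is not part of this tutorial's target site):

import urllib.request

# fetch a page and inspect the result
response = urllib.request.urlopen('http://httpbin.org/get')
print(response.status)                        # 200 on success
print(response.read().decode('utf-8')[:200])  # the body arrives as bytes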
2. Analyze the URL, using the 美女 ("Beauty") bar as the example:
# Page 1: https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=0
# Page 2: https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=50
# Comparing the two, the common part is https://tieba.baidu.com/f? ... &pn=
# pn is 0 on page 1 and 50 on page 2, so pn = (i - 1) * 50
import urllib.request
import urllib.parse   # needed to percent-encode the query string

baseurl = 'https://tieba.baidu.com/f?'
# kw=%E7%BE%8E%E5%A5%B3 is the percent-encoded form of "美女": each Chinese
# character becomes three %XX bytes in UTF-8. Store the raw keyword in a
# dict and let urlencode do the encoding.
name = input('Enter the keyword to search for: ')
kw = {'kw': name}
kw = urllib.parse.urlencode(kw)
# headers that make the request look like it comes from a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
}
# assemble the URL for each requested page
start = int(input('Enter the start page: '))
end = int(input('Enter the end page: '))
for i in range(start, end + 1):
    pn = (i - 1) * 50
    # the final URL for page i
    url = baseurl + kw + '&pn=' + str(pn)
    # send the request and read the response
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    # the body arrives as bytes; decode it as UTF-8 text
    html = response.read().decode('utf-8')
    # save the scraped page to a file (raw string so the backslashes survive)
    filename = r'E:\美女吧\第' + str(i) + '页.html'
    with open(filename, 'w', encoding='utf-8') as f:
        print('Scraping page %d' % i)
        f.write(html)   # the with statement closes the file automatically
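To see what the urlencode step produces on its own, here is a standalone sketch (the printed values follow from UTF-8 encoding, which is exactly what this example confirms):

import urllib.parse

# each Chinese character is UTF-8 encoded into three %XX bytes
print(urllib.parse.urlencode({'kw': '美女'}))  # kw=%E7%BE%8E%E5%A5%B3
print(urllib.parse.quote('美女'))              # %E7%BE%8E%E5%A5%B3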
The complete code:
import urllib.request
import urllib.parse


class BaiduSpider():
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
        }
        self.baseurl = 'https://tieba.baidu.com/f?'

    def readPage(self, url):
        # build the request with browser headers and return the decoded page
        req = urllib.request.Request(url, headers=self.headers)
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')
        return html

    def writePage(self, filename, html):
        # the with statement closes the file, so no explicit close() is needed
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def main(self):
        name = input('Enter the keyword to search for: ')
        start = int(input('Enter the start page: '))
        end = int(input('Enter the end page: '))
        kw = urllib.parse.urlencode({'kw': name})
        for i in range(start, end + 1):
            url = self.baseurl + kw + '&pn=' + str((i - 1) * 50)
            html = self.readPage(url)
            filename = r'E:\作品\Python\爬虫-网页\贴吧-妹子\class版\妹子第' + str(i) + '页.html'
            print('Scraping page %d' % i)
            self.writePage(filename, html)


if __name__ == '__main__':
    spider = BaiduSpider()
    spider.main()
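Note that urlopen raises urllib.error.HTTPError for 4xx/5xx responses and urllib.error.URLError for network failures, so readPage above will crash on either. A more defensive variant might look like this (a sketch; the read_page_safe helper name and the 10-second timeout are assumptions, not part of the original code):

import urllib.request
import urllib.error

def read_page_safe(url, headers):
    # hypothetical helper: returns the decoded page, or None on failure
    req = urllib.request.Request(url, headers=headers)
    try:
        response = urllib.request.urlopen(req, timeout=10)  # assumed timeout
        return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        print('HTTP error %d for %s' % (e.code, url))
    except urllib.error.URLError as e:
        print('Network error for %s: %s' % (url, e.reason))
    return None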
The requests module
The requests module is demonstrated below, again with Baidu Tieba as the example:
# Import the requests package. requests is a third-party library, so it must
# be installed first: run pip install requests at the command prompt (cmd).
import requests
# Analyze the URL
# Page 1: https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=0
# Page 2: https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=50
# Comparing the two, the common part is https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3, followed by &pn=
# pn is 0 on page 1 and 50 on page 2
# so pn = (i - 1) * 50
baseurl = 'https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    # Cookie copied from a browser session; Tieba list pages generally load without it
    'Cookie': 'BIGipServerpool_index=804258314.43286.0000; RAIL_EXPIRATION=1588587318843; RAIL_DEVICEID=ZgwnwnZO9_yBuDvgQ2tYUeoY8HArzTO0jVYH7i1HV7joCIruN9FKsxWFduaZnJcVyRi6LO4kgvPHPO9AKEUxL36_vel-lhEO-XUkLq81sctFLQJy2vKZYjEBfjaS_ClMXDsBAWPQrzzfQ7t3ZM6ltHMtGuopnE5m; route=c5c62a339e7744272a54643b3be5bf64; BIGipServerotn=233832970.50210.0000'
}
start = int(input('Enter the start page: '))
end = int(input('Enter the end page: '))
for i in range(start, end + 1):
    url = baseurl + '&pn=' + str((i - 1) * 50)
    response = requests.get(url, headers=headers)
    # response.content is raw bytes; decode it explicitly as UTF-8
    html = response.content.decode('utf-8')
    print(html)
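For comparison, requests can assemble the query string itself through its params argument, which removes both the manual concatenation and the pre-encoded kw value (a sketch of the same loop under that assumption; the pages fetched are the same):

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
for i in range(1, 3):  # pages 1 and 2, for illustration
    # requests percent-encodes the values and builds the query string itself
    params = {'kw': '美女', 'pn': (i - 1) * 50}
    response = requests.get('https://tieba.baidu.com/f', headers=headers, params=params)
    print(response.url)   # the fully assembled URL
    html = response.text  # str, decoded with the encoding requests detects

Here response.text returns an already-decoded string, while response.content returns raw bytes for manual decoding as in the loop above.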