目录
urllib:
python内置的http请求库
4个模块:
request模块:最基本的http请求模块
error模块:异常处理模块
parse模块:工具模块 提供url的处理方法
robotparser模块:识别robots.txt
最基本的请求:
使用urlopen发送
get请求:
带参数直接在url上面拼接
参数可能不止一个
多参数:urllib.parse.urlencode(params)
基本的URL地址和一个包含查询参数的字典params
使用urllib.parse.urlencode()函数将查询参数编码为查询字符串
post请求:
添加data(可选)
爬取一个百度网页
#!/usr/bin/env python3
"""Fetch the Baidu homepage and save the HTML to a local file."""
import urllib.request


def load_baidu_data():
    """Download http://www.baidu.com and write the decoded page to bd.html."""
    url = 'http://www.baidu.com'
    response = urllib.request.urlopen(url)
    data = response.read()
    # The HTTP response body is bytes; decode to str before writing as text.
    str_data = data.decode('utf-8')
    with open("bd.html", "w", encoding='utf-8') as f:
        f.write(str_data)


if __name__ == "__main__":
    # Guard so importing this module does not trigger a network request.
    load_baidu_data()
get请求拼接
代码实现百度搜索柯南
#!/usr/bin/env python3
"""Search Baidu for a (Chinese) keyword and save the result page."""
import urllib.request
import urllib.parse
import string


def build_search_url(keyword, base='http://www.baidu.com/s?wd='):
    """Return the full search URL with non-ASCII characters percent-encoded.

    Chinese characters are not legal in a URL, so every character outside
    the printable-ASCII set is percent-encoded as UTF-8 bytes.
    """
    return urllib.parse.quote(base + keyword, safe=string.printable)


def load_baidu_data():
    """Fetch the Baidu search results for "柯南" and save them to a file."""
    encode_url = build_search_url("柯南")
    response = urllib.request.urlopen(encode_url)
    data = response.read()
    # bytes -> str so the page can be written out as text.
    str_data = data.decode('utf-8')
    with open("baidu-kenan.html", "w", encoding="utf-8") as f:
        f.write(str_data)


if __name__ == "__main__":
    # Guard so importing this module does not trigger a network request.
    load_baidu_data()

get请求拼接多个参数
代码实现百度搜索柯南第九页,和上一个步骤一样先观察url

#!/usr/bin/env python3
"""Search Baidu with multiple query parameters (keyword + page offset)."""
import urllib.request
import urllib.parse


def build_query_url(params, base='http://www.baidu.com/s?'):
    """Return *base* plus the urlencoded query string built from *params*."""
    return base + urllib.parse.urlencode(params)


def load_baidu_data():
    """Fetch the ninth result page for "柯南" and save it to a file."""
    params = {
        "wd": "柯南",
        "pn": "80",  # result offset: 80 with 10 hits/page -> page 9
    }
    final_url = build_query_url(params)
    print(final_url)
    response = urllib.request.urlopen(final_url)
    data = response.read()
    # bytes -> str so the page can be written out as text.
    str_data = data.decode('utf-8')
    with open("baidu-kenan-pn80.html", "w", encoding="utf-8") as f:
        f.write(str_data)


if __name__ == "__main__":
    # Guard so importing this module does not trigger a network request.
    load_baidu_data()
成功

post请求(data,timeout):
#!/usr/bin/env python
"""POST a small form to a URL with a short timeout."""
import socket
import urllib.request
import urllib.parse
import urllib.error


def encode_form(form):
    """urlencode *form* and encode to UTF-8 bytes.

    urlopen's ``data`` argument must be bytes; passing bytes is also what
    makes the request a POST. (decode("utf-8") goes the other way,
    bytes -> str.)
    """
    return urllib.parse.urlencode(form).encode("utf-8")


def post_with_timeout(url, form, timeout=0.1):
    """Send *form* via POST to *url*; print the response or a failure notice."""
    data_encode = encode_form(form)
    try:
        response = urllib.request.urlopen(url=url, data=data_encode,
                                          timeout=timeout)
        print(response.read().decode("utf-8"))
    except TimeoutError:
        # Raised directly when the socket read times out (Python 3.10+).
        print("连接超时!")
    except urllib.error.URLError as e:
        # BUG in original: every URLError (DNS failure, refused connection,
        # ...) was reported as a timeout. Only claim a timeout when the
        # underlying reason really is one.
        if isinstance(e.reason, socket.timeout):
            print("连接超时!")
        else:
            print(f"请求失败: {e.reason}")


if __name__ == "__main__":
    # Pick any endpoint that accepts POST; guard keeps import side-effect free.
    post_with_timeout('http://www.baidu.com/post',
                      {'hello': 'world', 'name': 'kenan'})
User-Agent
自定义User-Agent
#!/usr/bin/env python
"""POST to a URL with a custom User-Agent header."""
import urllib.request
import urllib.parse
import urllib.error

# Mobile-browser User-Agent string so the server does not serve the
# default-urllib ("Python-urllib/x.y") response.
USER_AGENT = "Mozilla/5.0 (Linux; U; Android 11; zh-CN; 2112123AC Build/RKQ1.200826.002) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/100.0.4896.58 Quark/6.2.2.246 Mobile Safari/537.36"


def build_request(url):
    """Return a POST ``Request`` for *url* carrying the custom User-Agent."""
    header = {"User-Agent": USER_AGENT}
    return urllib.request.Request(url=url, headers=header, method='POST')


if __name__ == "__main__":
    url = 'http://www.baidu.com/post'
    req = build_request(url)
    # BUG in original: `response = url` assigned the URL string instead of
    # sending the request; the Request must be passed to urlopen.
    response = urllib.request.urlopen(req)
    print(response.read().decode("utf-8"))

最低0.47元/天 解锁文章
270

被折叠的 条评论
为什么被折叠?



