# urlencode应用场景:多个参数的时候
# https://www.baidu.com/s?wd=周杰伦&sex=男
# import urllib.parse
# data = {
# 'wd':'周杰伦',
# 'sex':'男',
# }
#
# a = urllib.parse.urlencode(data)
#
# print(a)
# 获取https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7的网页源码
import urllib.request
import urllib.parse
# Fetch the HTML source of a Baidu search results page and print it.
#
# The query string below is already percent-encoded (UTF-8); it is what
# urllib.parse.urlencode() produces for wd=周杰伦, sex=男, localtion=中国台湾省.
# The parameter name 'localtion' is kept exactly as the original script sends it.
base_url = 'https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&localtion=%E4%B8%AD%E5%9B%BD%E5%8F%B0%E6%B9%BE%E7%9C%81'

# Only Cookie and User-Agent are sent. The remaining headers a browser would
# add (Accept, Accept-Language, Cache-Control, Connection, Host, Sec-Fetch-*,
# Upgrade-Insecure-Requests, sec-ch-ua*) are intentionally omitted.
# Accept-Encoding in particular must not be sent: urllib does not decompress
# gzip/br response bodies, so the UTF-8 decode below would fail on them.
#
# NOTE(review): the Cookie value is a logged-in browser session (it contains
# BDUSS) pasted verbatim. It will expire, and it should not be committed to
# version control; consider loading it from the environment instead.
headers = {
    'Cookie': 'BIDUPSID=0B56D4240CE10054D580E87426E4BC21; PSTM=1709189838; BAIDUID=7F910C31350973D80053F08F4D940583:FG=1; H_WISE_SIDS=60237_60273_60327_60336; H_WISE_SIDS_BFESS=60237_60273_60327_60336; BDUSS=nAwcXhDRXZISkRpS3lRbnFGOERnYjltNVc4aHpnNjBkQ3M1SUpPUm1INHdjZ2huRVFBQUFBJCQAAAAAAAAAAAEAAABtmLlcAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADDl4GYw5eBmN; BDUSS_BFESS=nAwcXhDRXZISkRpS3lRbnFGOERnYjltNVc4aHpnNjBkQ3M1SUpPUm1INHdjZ2huRVFBQUFBJCQAAAAAAAAAAAEAAABtmLlcAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADDl4GYw5eBmN; MCITY=-214%3A; BD_UPN=12314753; H_PS_PSSID=60273_60853_60886_60875_60898; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=7F910C31350973D80053F08F4D940583:FG=1; B64_BOT=1; RT="z=1&dm=baidu.com&si=357ff16f-8f88-40da-a603-13963079ed7c&ss=m2heo1nr&sl=4&tt=3s8&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&r=cey5vtq&ul=649d&hd=64i9"; ZD_ENTRY=bing; ab_sr=1.0.1_Yjg1NThkOTU2ODA3YjBiYjBhY2ZlOWVkMGFjZDExYTEyY2FjYzllNzM5MjAyM2U4NWZmNWZmZGNmYmMyYWNhZjZkOGQ5NzljNjdlYTA3OTg3ZTkwNzYxYzE0ZjgyMTQ3YTY1ZjE2ZjQ2MGRhOGEzMGY3NTJjZDE1NDM5OTc2YjcwMDlhYjJlN2IwMDY5Y2E3YmEzNjg2MmFlZjM3MDEzOWZmNDZiZTRiMjg5Yzk5ZjJhMWQxNTc4YjljMGRiMmU2; delPer=0; BD_CK_SAM=1; PSINO=1; H_PS_645EC=e7f6rj9kbJpggQiBTCqlwH8%2BR%2BkPk%2FU8qWgAZB6SJwDGkDy5RDx8Q9iQYIg; BA_HECTOR=2ga0800hag2k2l2h000ha1a1araod61jh9p741v; ZFY=oCD7Di5sJxOWCfX9rwp4QUQB52eBhP5VUSLyOukGY:Ao:C; baikeVisitId=9f3f5c44-486a-4b32-8731-5f80bbc4ae7d',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36 Edg/122.0.0.0',
}

# Passing no data= makes this a GET request carrying the custom headers.
request = urllib.request.Request(url=base_url, headers=headers)

# The with-block closes the HTTP response (and its socket) even if read() or
# decode() raises; the original left the response open.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

print(content)
# 如果爬取网页时返回的是"百度安全验证"页面,需要在 User-Agent 的基础上再加上 Cookie;Cookie 同样可以在浏览器开发者工具(检查)的 Network 面板里找到。
1747

被折叠的 条评论
为什么被折叠?



