The most important skill is using the browser's front-end developer tools to work out a good XPath or regular-expression statement before writing any scraping code.
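For instance, an XPath worked out in the devtools console can be reused verbatim with lxml, and a regex can pull out the same text. A minimal sketch on a made-up HTML fragment (the markup and expressions here are purely illustrative, not from a real page):
import re
import lxml.etree as le
# illustrative HTML fragment; in practice this would be a downloaded page
html = '<ul><li class="item"><a href="/post/1">First post</a></li>' \
       '<li class="item"><a href="/post/2">Second post</a></li></ul>'
# XPath tested in the browser's devtools, reused as-is with lxml
titles = le.HTML(html).xpath('//li[@class="item"]/a/text()')
print(titles)  # ['First post', 'Second post']
# the same extraction with a regular expression
print(re.findall(r'<a href="[^"]*">([^<]*)</a>', html))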
import urllib.request as ur
ret = ur.urlopen('https://edu.youkuaiyun.com/')         # returns an <class 'http.client.HTTPResponse'> object
ret = ur.urlopen('https://edu.youkuaiyun.com/').read()  # returns the page body as bytes, <class 'bytes'>
print(type(ret))
# with open('edu.html','wb') as f:
#     f.write(ret)
Both the snippet above and the one below open a URL. The usual approach is the one below: wrap the URL in a Request object first, which is more flexible, since headers, body data and so on can be attached to it.
import urllib.request as ur
request = ur.Request('https://edu.youkuaiyun.com/')
response = ur.urlopen(request).read()  # returns the page body as bytes, <class 'bytes'>
print(response)
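A small illustration of that flexibility: a Request can carry custom request headers, and the same urlopen() call then sends them. A minimal sketch (the User-Agent value is just an assumed browser-like placeholder, not taken from these notes):
import urllib.request as ur
request = ur.Request(
    url='https://edu.youkuaiyun.com/',
    headers={'User-Agent': 'Mozilla/5.0'},  # assumed placeholder UA string
)
response = ur.urlopen(request)
print(response.status)        # HTTP status code, e.g. 200
print(response.read()[:100])  # first 100 bytes of the body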
# URL encoding/decoding with urllib, and how to use a Request with a query string
import urllib.parse as up
import urllib.request as ur
kw = '动漫'  # keyword; the three parameters below are taken from the query string in the address bar and differ from site to site
data = {
    'kw': kw,
    'ie': 'utf-8',  # character encoding
    'pn': '100',    # page offset
}
data_url = up.urlencode(data)  # encode: kw=%E5%8A%A8%E6%BC%AB&ie=utf-8&pn=100
ret = up.unquote(data_url)     # decode: kw=动漫&ie=utf-8&pn=100
request = ur.Request('https://tieba.baidu.com/f?' + data_url)  # GET request
print(type(request))   # <class 'urllib.request.Request'>
response = ur.urlopen(request).read()
print(type(response))  # <class 'bytes'>
# with open('%s.html' % kw,'wb') as f:
#     f.write(response)
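Besides urlencode() for a whole parameter dict, urllib.parse also offers quote()/unquote() for a single value; a minimal sketch reusing the keyword above:
import urllib.parse as up
kw = '动漫'
print(up.quote(kw))              # %E5%8A%A8%E6%BC%AB   percent-encode one value
print(up.urlencode({'kw': kw}))  # kw=%E5%8A%A8%E6%BC%AB encode a whole query dict
print(up.unquote(up.quote(kw)))  # 动漫                  round-trip back to readable text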
'''
Submitting an HTTP request with POST
'''
import urllib.request as ur
import urllib.parse as up
import json
word = input('Enter the Chinese word to translate: ')
data = {
    'kw': word,
}
data_url = up.urlencode(data)  # encode first: the input may be Chinese and needs to become a URL-safe query string
request = ur.Request(          # attaching data makes urllib send this as a POST request
    url='https://fanyi.baidu.com/sug',
    data=data_url.encode('utf-8'),  # the request body must be bytes, so encode the urlencoded string to UTF-8 bytes
)
response = ur.urlopen(request).read()  # bytes
print(type(response))  # <class 'bytes'>
ret = json.loads(response)
# print(ret)
translate = ret['data'][0]['v']
print(translate)
'''
Anti-scraping countermeasure: proxy IPs
'''
import urllib.request as ur
# a proxy IP can be obtained by registering at http://www.data5u.com/
proxy_address = ur.urlopen('http://api.ip.data5u.com/dynamic/get.html?order=dfcdc65b3d8392e908af488390d03467&random=1&sep=4').read().decode('utf-8').strip()
# create a ProxyHandler object for the proxy IP
proxy_handler = ur.ProxyHandler(
    {
        'http': proxy_address,
        'https': proxy_address,  # map https as well, since the target URL below is https
    }
)
# build a new opener object wrapped around the proxy_handler
proxy_opener = ur.build_opener(proxy_handler)
# the address to visit
request = ur.Request(url='https://edu.youkuaiyun.com/')
# open the URL through the proxy-wrapped opener
response = proxy_opener.open(request).read()
print(response)
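If every later ur.urlopen() call should also go through the proxy, the opener can be installed globally instead of being called directly; a minimal sketch assuming the proxy_handler built above:
import urllib.request as ur
# assumption: proxy_handler was created as above
ur.install_opener(ur.build_opener(proxy_handler))  # make the proxy the process-wide default
response = ur.urlopen('https://edu.youkuaiyun.com/').read()
print(len(response))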
'''
Faking the User-Agent to get past browser-checking anti-scraping measures,
and adding a Cookie header to simulate a logged-in session
'''
import urllib.request as ur
import user_agent
import lxml.etree as le
request = ur.Request(
    url='https://edu.youkuaiyun.com/mycollege',
    headers={
        'User-Agent': user_agent.get_user_agent_pc(),
'Cookie':'uuid_tt_dd=10_291046340-1549030766199-774346; smidV2=2019032808063205c4b158a28825bdfbb6108edc9a5f510091c3fec0aad6150; ADHOC_MEMBERSHIP_CLIENT_ID1.0=39699c49-111c-7665-4452-b270ee17f1b4; _ga=GA1.2.924224088.1556522733; UM_distinctid=16a6800cde7335-02ca3a55a6d7aa-3a614f0b-1fa400-16a6800cde9373; UN=kzl_knight; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_291046340-1549030766199-774346!5744*1*kzl_knight!1788*1*PC_VC; dc_session_id=10_1563242124060.407147; UserName=kzl_knight; UserInfo=87b6c178121e4ebb9c698b6c3bbcdcd9; UserToken=87b6c178121e4ebb9c698b6c3bbcdcd9; UserNick=kzl_knight; AU=5AA; BT=1564751420031; p_uid=U000000; Hm_ct_e5ef47b9f471504959267fd614d579cd=5744*1*kzl_knight!6525*1*10_291046340-1549030766199-774346; __yadk_uid=b07B7nB4DNxlZBhsPmydqZ8oixohVRvY; TY_SESSION_ID=df97a907-a50c-4bca-a926-4b9edd956598; cname11736=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1565085616,1565100208,1565143666,1565148547; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1565156088; dc_tos=pvuq60',
    }
)
response = ur.urlopen(request).read().decode('utf-8')
# with open('mycollege2.html','w',encoding='utf-8') as f:
#     f.write(response)
html_x = le.HTML(response)
title_s = html_x.xpath('//li[@class="item_box"]//h1/a/text()')
print(title_s)
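Instead of pasting a cookie string copied out of the browser, cookies can also be handled automatically with the standard library's http.cookiejar; a minimal sketch (whether this keeps a session logged in still depends on the server actually issuing session cookies):
import http.cookiejar
import urllib.request as ur
# a CookieJar stores any Set-Cookie headers the server returns and replays
# them on later requests made through the same opener
cookie_jar = http.cookiejar.CookieJar()
opener = ur.build_opener(ur.HTTPCookieProcessor(cookie_jar))
response = opener.open('https://edu.youkuaiyun.com/').read()
for cookie in cookie_jar:
    print(cookie.name, cookie.value)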
# Scraping popular blog posts
import urllib.request as ur
import urllib.parse as up
import lxml.etree as le
import re
# search-URL template (for reference; the loop below builds the URL inline)
url = 'https://so.youkuaiyun.com/so/search/s.do?p={page}&q={keyword}&t=blog&viparticle=&domain=&o=&s=&u=&l=&f=&rbg=0'
def getResponse(url):
    req = ur.Request(
        url=url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        }
    )
    # bytes object: write it below with  with open(filepath,'wb') as f:
    response = ur.urlopen(req).read()
    # str object: write it below with  with open(filepath,'w',encoding='utf-8') as f:
    # (keeping both lines fetches the page twice; in practice keep only one)
    response = ur.urlopen(req).read().decode('utf-8','ignore')
    return response
if __name__ == '__main__':
    keyword = input('Keyword: ')
    pn_start = int(input('Start page: '))
    pn_end = int(input('End page: '))
    '''
    If the keyword contains Chinese, it has to be percent-encoded into a form the URL can carry:
    data = {
        'q': keyword,
    }
    data_url = up.urlencode(data)  # encode (machine-readable: q=%E5%AD%A6%E4%B9%A0java)
    ret = up.unquote(data_url)     # decode (human-readable: q=学习java)
    '''
    data = {
        'q': keyword,
    }
    data_url = up.urlencode(data)
    for page in range(pn_start, pn_end + 1):
        # fetch the level-1 page (the search-result list)
        # bytes version:
        # response = getResponse(
        #     url='https://so.youkuaiyun.com/so/search/s.do?p={page}&q={keyword}&t=blog&viparticle=&domain=&o=&s=&u=&l=&f=&rbg=0'.format(
        #         page=page, keyword=keyword)
        # )
        # str version:
        response = getResponse(
            url='https://so.youkuaiyun.com/so/search/s.do?p={page}&{keyword}&t=blog&viparticle=&domain=&o=&s=&u=&l=&f=&rbg=0'.format(
                page=page, keyword=data_url)
        )
        # level-2 pages: links to the individual blog posts
        hrefs = le.HTML(response).xpath('//div[@class="search-list-con"]/dl//span[@class="mr16"]/../../dt/div/a[1]/@href')
        for href in hrefs:
            response_blog = getResponse(
                url=href,
            )
            title = le.HTML(response_blog).xpath('//h1[@class="title-article"]/text()')[0]
            # strip characters that are not allowed in file names
            title = re.sub(r'[/\\:*"<>|?]', '', title)
            filepath = 'blog/%s.html' % title  # the blog/ directory must already exist
            # writing a bytes object:
            # with open(filepath,'wb') as f:
            #     f.write(response_blog)
            # writing a str object:
            with open(filepath,'w',encoding='utf-8') as f:
                f.write(response_blog)
            print(title)