The most important skill is using the browser's front-end developer tools to work out a good XPath or regular-expression statement before writing any scraping code.
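For instance, an XPath worked out in the devtools console can be reused verbatim with lxml, and a regex can pull out the same text. A minimal sketch on a made-up HTML fragment (the markup and expressions here are purely illustrative, not from a real page):
import re
import lxml.etree as le
# illustrative HTML fragment; in practice this would be a downloaded page
html = '<ul><li class="item"><a href="/post/1">First post</a></li>' \
       '<li class="item"><a href="/post/2">Second post</a></li></ul>'
# XPath tested in the browser's devtools, reused as-is with lxml
titles = le.HTML(html).xpath('//li[@class="item"]/a/text()')
print(titles)  # ['First post', 'Second post']
# the same extraction with a regular expression
print(re.findall(r'<a href="[^"]*">([^<]*)</a>', html))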
import urllib.request as ur
ret = ur.urlopen('https://edu.youkuaiyun.com/')         # returns an <class 'http.client.HTTPResponse'> object
ret = ur.urlopen('https://edu.youkuaiyun.com/').read()  # returns the page body as bytes, <class 'bytes'>
print(type(ret))
# with open('edu.html','wb') as f:
#     f.write(ret)
Both the snippet above and the one below open a URL. The usual approach is the one below: wrap the URL in a Request object first, which is more flexible, since headers, body data and so on can be attached to it.
import urllib.request as ur
request = ur.Request('https://edu.youkuaiyun.com/')
response = ur.urlopen(request).read()  # returns the page body as bytes, <class 'bytes'>
print(response)
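A small illustration of that flexibility: a Request can carry custom request headers, and the same urlopen() call then sends them. A minimal sketch (the User-Agent value is just an assumed browser-like placeholder, not taken from these notes):
import urllib.request as ur
request = ur.Request(
    url='https://edu.youkuaiyun.com/',
    headers={'User-Agent': 'Mozilla/5.0'},  # assumed placeholder UA string
)
response = ur.urlopen(request)
print(response.status)        # HTTP status code, e.g. 200
print(response.read()[:100])  # first 100 bytes of the body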
# URL encoding/decoding with urllib, and how to use a Request with a query string
import urllib.parse as up
import urllib.request as ur
kw = '动漫'  # keyword; the three parameters below are taken from the query string in the address bar and differ from site to site
data = {
    'kw': kw,
    'ie': 'utf-8',  # character encoding
    'pn': '100',    # page offset
}
data_url = up.urlencode(data)  # encode: kw=%E5%8A%A8%E6%BC%AB&ie=utf-8&pn=100
ret = up.unquote(data_url)     # decode: kw=动漫&ie=utf-8&pn=100
request = ur.Request('https://tieba.baidu.com/f?' + data_url)  # GET request
print(type(request))   # <class 'urllib.request.Request'>
response = ur.urlopen(request).read()
print(type(response))  # <class 'bytes'>
# with open('%s.html' % kw,'wb') as f:
#     f.write(response)
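Besides urlencode() for a whole parameter dict, urllib.parse also offers quote()/unquote() for a single value; a minimal sketch reusing the keyword above:
import urllib.parse as up
kw = '动漫'
print(up.quote(kw))              # %E5%8A%A8%E6%BC%AB   percent-encode one value
print(up.urlencode({'kw': kw}))  # kw=%E5%8A%A8%E6%BC%AB encode a whole query dict
print(up.unquote(up.quote(kw)))  # 动漫                  round-trip back to readable text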
'''
Submitting an HTTP request with POST
'''
import urllib.request as ur
import urllib.parse as up
import json
word = input('Enter the Chinese word to translate: ')
data = {
    'kw': word,
}
data_url = up.urlencode(data)  # encode first: the input may be Chinese and needs to become a URL-safe query string
request = ur.Request(          # attaching data makes urllib send this as a POST request
    url='https://fanyi.baidu.com/sug',
    data=data_url.encode('utf-8'),  # the request body must be bytes, so encode the urlencoded string to UTF-8 bytes
)
response = ur.urlopen(request).read()  # bytes
print(type(response))  # <class 'bytes'>
ret = json.loads(response)
# print(ret)
translate = ret['data'][0]['v']
print(translate)
'''
Anti-scraping countermeasure: proxy IPs
'''
import urllib.request as ur
# a proxy IP can be obtained by registering at http://www.data5u.com/
proxy_address = ur.urlopen('http://api.ip.data5u.com/dynamic/get.html?order=dfcdc65b3d8392e908af488390d03467&random=1&sep=4').read().decode('utf-8').strip()
# create a ProxyHandler object for the proxy IP
proxy_handler = ur.ProxyHandler(
    {
        'http': proxy_address,
        'https': proxy_address,  # map https as well, since the target URL below is https
    }
)
# build a new opener object wrapped around the proxy_handler
proxy_opener = ur.build_opener(proxy_handler)
# the address to visit
request = ur.Request(url='https://edu.youkuaiyun.com/')
# open the URL through the proxy-wrapped opener
response = proxy_opener.open(request).read()
print(response)
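If every later ur.urlopen() call should also go through the proxy, the opener can be installed globally instead of being called directly; a minimal sketch assuming the proxy_handler built above:
import urllib.request as ur
# assumption: proxy_handler was created as above
ur.install_opener(ur.build_opener(proxy_handler))  # make the proxy the process-wide default
response = ur.urlopen('https://edu.youkuaiyun.com/').read()
print(len(response))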
'''
Faking the User-Agent to get past browser-checking anti-scraping measures,
and adding a Cookie header to simulate a logged-in session
'''
import urllib.request as ur
import user_agent
import lxml.etree as le
request = ur.Request(
    url='https://edu.youkuaiyun.com/mycollege',
    headers={
        'User-Agent': user_agent.get_user_agent_pc(),
'Cookie':'uuid_tt_dd=10_291046340-1549030766199-774346; smidV2=2019032808063205c4b158a28825bdfbb6108edc9a5f510091c3fec0aad6150; ADHOC_MEMBERSHIP_CLIENT_ID1.0=39699c49-111c-7665-4452-b270ee17f1b4; _ga=GA1.2.924224088.1556522733; UM_distinctid=16a6800cde7335-02ca3a55a6d7aa-3a614f0b-1fa400-16a6800cde9373; UN=kzl_knight; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_291046340-1549030766199-774346!5744*1*kzl_knight!1788*1*PC_VC; dc_session_id=10_1563242124060.407147; UserName=kzl_knight; UserInfo=87b6c178121e4ebb9c698b6c3bbcdcd9; UserToken=87b6c178121e4ebb9c698b6c3bbcdcd9; UserNick=kzl_knight; AU=5AA; BT=1564751420031; p_uid=U000000; Hm_ct_e5ef47b9f471504959267fd614d579cd=5744*1*kzl_knight!6525*1*10_291046340-1549030766199-774346; __yadk_uid=b07B7nB4DNxlZBhsPmydqZ8oixohVRvY; TY_SESSION_ID=df97a907-a50c-4bca-a926-4b9edd956598; cname11736=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1565085616,1565100208,1565143666,1565148547; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1565156088; dc_tos=pvuq60',
    }
)
response = ur.urlopen(request).read().decode('utf-8')
# with open('mycollege2.html','w',encoding='utf-8') as f:
#     f.write(response)
html_x = le.HTML(response)
title_s = html_x.xpath('//li[@class="item_box"]//h1/a/text()')
print(title_s)
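Instead of pasting a cookie string copied out of the browser, cookies can also be handled automatically with the standard library's http.cookiejar; a minimal sketch (whether this keeps a session logged in still depends on the server actually issuing session cookies):
import http.cookiejar
import urllib.request as ur
# a CookieJar stores any Set-Cookie headers the server returns and replays
# them on later requests made through the same opener
cookie_jar = http.cookiejar.CookieJar()
opener = ur.build_opener(ur.HTTPCookieProcessor(cookie_jar))
response = opener.open('https://edu.youkuaiyun.com/').read()
for cookie in cookie_jar:
    print(cookie.name, cookie.value)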
# Scraping popular blog posts
import urllib.request as ur
import urllib.parse as up
import lxml.etree as le
import re
# search-URL template (for reference; the loop below builds the URL inline)
url = 'https://so.youkuaiyun.com/so/search/s.do?p={page}&q={keyword}&t=blog&viparticle=&domain=&o=&s=&u=&l=&f=&rbg=0'
def getResponse(url):
    req = ur.Request(
        url=url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        }
    )
    # bytes object: write it below with  with open(filepath,'wb') as f:
    response = ur.urlopen(req).read()
    # str object: write it below with  with open(filepath,'w',encoding='utf-8') as f:
    # (keeping both lines fetches the page twice; in practice keep only one)
    response = ur.urlopen(req).read().decode('utf-8','ignore')
    return response
if __name__ == '__main__':
    keyword = input('Keyword: ')
    pn_start = int(input('Start page: '))
    pn_end = int(input('End page: '))
    '''
    If the keyword contains Chinese, it has to be percent-encoded into a form the URL can carry:
    data = {
        'q': keyword,
    }
    data_url = up.urlencode(data)  # encode (machine-readable: q=%E5%AD%A6%E4%B9%A0java)
    ret = up.unquote(data_url)     # decode (human-readable: q=学习java)
    '''
    data = {
        'q': keyword,
    }
    data_url = up.urlencode(data)
    for page in range(pn_start, pn_end + 1):
        # fetch the level-1 page (the search-result list)
        # bytes version:
        # response = getResponse(
        #     url='https://so.youkuaiyun.com/so/search/s.do?p={page}&q={keyword}&t=blog&viparticle=&domain=&o=&s=&u=&l=&f=&rbg=0'.format(
        #         page=page, keyword=keyword)
        # )
        # str version:
        response = getResponse(
            url='https://so.youkuaiyun.com/so/search/s.do?p={page}&{keyword}&t=blog&viparticle=&domain=&o=&s=&u=&l=&f=&rbg=0'.format(
                page=page, keyword=data_url)
        )
        # level-2 pages: links to the individual blog posts
        hrefs = le.HTML(response).xpath('//div[@class="search-list-con"]/dl//span[@class="mr16"]/../../dt/div/a[1]/@href')
        for href in hrefs:
            response_blog = getResponse(
                url=href,
            )
            title = le.HTML(response_blog).xpath('//h1[@class="title-article"]/text()')[0]
            # strip characters that are not allowed in file names
            title = re.sub(r'[/\\:*"<>|?]', '', title)
            filepath = 'blog/%s.html' % title  # the blog/ directory must already exist
            # writing a bytes object:
            # with open(filepath,'wb') as f:
            #     f.write(response_blog)
            # writing a str object:
            with open(filepath,'w',encoding='utf-8') as f:
                f.write(response_blog)
            print(title)