urllib
from urllib import request

url = 'https://www.baidu.com/'
response = request.urlopen(url)
# read the raw bytes of the page
text = response.read()
# save the page content to a local file
with open('./baidu.html', mode='wb') as fp:
    fp.write(text)
print('Page data saved successfully')
Request gives a more complete way to build a fetch: it can carry request headers and other data.
headers = {'User-Agent': 'Mozilla/5.0'}
request1 = request.Request(url=url, headers=headers)
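A minimal sketch of actually sending the Request object, assuming the same Baidu URL as above; the User-Agent value is only illustrative.
from urllib import request

url = 'https://www.baidu.com/'
# a browser-style User-Agent replaces the default Python-urllib identifier
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
req = request.Request(url=url, headers=headers)
# urlopen accepts a Request object as well as a plain URL string
response = request.urlopen(req)
print(response.read()[:200])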
Simulating a GET request
import urllib.parse
import urllib.request

url = 'http://httpbin.org/get?%s'
if __name__ == '__main__':
    params = {'age': 35, 'sex': '男', 'work_years': 15}
    # URL-encode the query parameters
    params = urllib.parse.urlencode(params)
    # if the data argument is given, the request becomes a POST
    # with a GET request, every parameter is exposed in the URL
    response = urllib.request.urlopen(url=url % (params))
    print(response.read().decode())
Simulating a POST request
url = 'http://httpbin.org/post'
if __name__ == '__main__':
    # simulate a POST request
    params = {'Language': 'Python', 'salary': 20000, 'work_time': 996}
    # urlencode, then encode to bytes: the data argument must be bytes
    params = urllib.parse.urlencode(params).encode()
    print(params)
    # by default the request identifies itself with the header Python-urllib/3.7
    response = urllib.request.urlopen(url=url, data=params)
    # print the data echoed back by the server
    print(response.read().decode())
Accessing a page through a proxy
url = 'http://httpbin.org/ip'
# request without a proxy
response = urllib.request.urlopen(url=url)
print(response.read().decode())
# use a proxy to disguise the crawler and avoid getting the IP banned
ph = urllib.request.ProxyHandler({'http': '117.69.201.206:9999'})  # this proxy address may no longer work
# build an opener that routes requests through the proxy
opener = urllib.request.build_opener(ph)
# open the URL through the proxy
response2 = opener.open(url)
print('With the proxy, the IP is:', response2.read().decode())
Using a proxy with requests
import requests

response = requests.get(url=url, proxies={'http': 'http://455098435:lbrv3bgb@121.42.140.113:16816'}, timeout=20)
print(response.text)
Batch-downloading images
import re
import time
import threading
from concurrent.futures import ThreadPoolExecutor

import requests

url1 = 'http://sc.chinaz.com/tupian/index.html'
url = 'http://sc.chinaz.com/tupian/index_%d.html'

# use a thread pool to speed the downloads up
def download_image(img_url):
    # print(img_url)
    response = requests.get(img_url)
    filename = img_url.rsplit('/', 1)[-1]
    with open('pictures/%s' % (filename), mode='wb') as fp:
        fp.write(response.content)
    # print('-------image %s saved--------' % (filename))

def get_image_urls(num):
    start2 = time.time()
    for i in range(1, num + 1):
        # the first page has no index suffix
        if i == 1:
            url_pic = url1
        else:
            url_pic = url % (i)
        print('-------downloading images from page %d--------' % (i))
        response = requests.get(url_pic)
        response.encoding = 'utf-8'
        # the site lazy-loads images, so the real address sits in the src2 attribute
        img_urls = re.findall(r'<img src2="(.*?)"', response.text)
        # submit every image to the pool; raise the worker count for real concurrency
        with ThreadPoolExecutor(1) as executor:
            for img_url in img_urls:
                # print(img_url)
                executor.submit(download_image, img_url)
        end2 = time.time()
        print("time2: " + str(end2 - start2))
        # t = threading.Thread(target=download_image, args=(img_url,))
        # t.start()

if __name__ == '__main__':
    try:
        num = int(input('Enter the number of pages to fetch: '))
    except ValueError:
        print('Please enter a number!')
        num = int(input('Enter the number of pages to fetch: '))
    get_image_urls(num)
Using XPath via the third-party lxml library
from lxml import etree

# the document below is a str
books = '''
<?xml version="1.0" encoding="utf-8"?>
<bookstore>
<book category="cooking">
<title lang="en">Everyday Italian</title>
<author>Giada De Laurentiis</author>
<year>2005</year>
<price>30.00</price>
</book>
<book category="children">
<title lang="en">Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
</book>
<book category="web" cover="paperback">
<title lang="en">XQuery Kick Start</title>
<author>James McGovern</author>
<author>Per Bothner</author>
<author>Kurt Cagle</author>
<author>James Linn</author>
<author>Vaidyanathan Nagarajan</author>
<year>2003</year>
<price>50</price>
</book>
</bookstore>
'''
html = etree.HTML(books)
# absolute path lookup; etree.HTML wraps the fragment in html/body
result = html.xpath('/html/body/bookstore/book/@category')  # ['cooking', 'children', 'web']
# // matches nodes at any depth, regardless of position
print(html.xpath('//book/@category'))  # ['cooking', 'children', 'web']
books = html.xpath('//book')
# relative query starting from the first book node
print('first book, relative query:', books[0].xpath('./year/text()'))  # ['2005']
print(html.xpath('//@*'))
# ['cooking', 'en', 'children', 'en', 'web', 'paperback', 'en']
print(html.xpath('//book[price mod 2 = 0]/title/text()'))  # titles of books whose price is divisible by 2
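A few more predicate queries against the same books document, as a sketch; the expected outputs are read off the XML above.
# books priced above 35
print(html.xpath('//book[price>35]/title/text()'))          # ['XQuery Kick Start']
# the category of the last book in the store
print(html.xpath('//bookstore/book[last()]/@category'))     # ['web']
# titles of books that list more than one author
print(html.xpath('//book[count(author)>1]/title/text()'))   # ['XQuery Kick Start']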
An XPath example
import requests
from lxml import etree
url1 = 'https://www.neihanba.com/dz/'
url = 'https://www.neihanba.com/dz/list_%d.html'
if __name__ == '__main__':
    fp = open('./duanzi.csv', mode='a', encoding='utf-8')
    for i in range(1, 101):
        # the first page has no index suffix
        if i == 1:
            url_duanzi = url1
        else:
            url_duanzi = url % (i)
        response = requests.get(url_duanzi)
        response.encoding = 'gbk'
        content = response.text
        html = etree.HTML(content)
        result = html.xpath('//ul[@class="piclist longList"]/li')
        for li in result:
            try:
                title = li.xpath('.//h4/a/b/text()')[0]
                content = li.xpath('.//div[@class="f18 mb20"]/text()')[0].strip()
                info = ''.join(li.xpath('.//div[@class="ft"]/span//text()')[1:])
                # tab-separated fields, one record per line
                fp.write('%s\t%s\t%s\n' % (title, content, info))
            except Exception as e:
                # failed items are skipped here; save them and crawl them separately later
                pass
        print('Page %d saved successfully!' % (i))
    fp.close()
# !!! the requests.get call itself has no exception handling
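The note above points out that the page request itself is unprotected: if requests.get times out or the connection drops, the whole loop stops. A minimal sketch of guarding the fetch, assuming it sits inside the same url_duanzi loop; the retry count and timeout are illustrative, not from the original code.
for attempt in range(3):
    try:
        response = requests.get(url_duanzi, timeout=10)
        break
    except requests.RequestException as e:
        print('Page %d request failed (attempt %d): %s' % (i, attempt + 1, e))
else:
    # every attempt failed, skip this page
    continue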