# Task: scrape images from an asynchronously loaded page and save them into a local folder.
# Dynamic data: Network -> XHR -> Response (page links, image links); find the URL under "Request".
# Example sites with asynchronous loading: Sina Weibo comments, Douban movies.
# Note: know how many elements each load returns (12 on this site), the local
# folder path, and that the folder is writable.
# A dynamic site's request parameters can be found in the Network tab!
from bs4 import BeautifulSoup
import requests, time, urllib.request
# Base listing URL; get_more_gages() appends the page number.
url = 'https://knewone.com/discover?page='
# NOTE(review): module-level `data` is never read (get_gage shadows it with a
# parameter); kept only to avoid breaking anything outside this view.
data = {}
# Local folder where downloaded images are stored; must exist and be writable.
folder_path = 'D:\\data\\imgs\\'
def get_gage(url, data=None):  # scrape one listing page and download its product images
    """Fetch one listing page, print each product's info, and download its image.

    Parameters:
        url:  full page URL (base URL with the page number appended).
        data: if None (the default) the page is scraped and images are
              downloaded into `folder_path`; any other value skips processing.
    """
    # timeout so a stalled server can't hang the whole crawl
    wb_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    imgs = soup.select('a.cover-inner > img')
    titles = soup.select('section.content > h4.title > a')
    links = soup.select('section.content > h4 > a')
    if data is None:  # 'is None' rather than '== None' (PEP 8)
        for img, title, link in zip(imgs, titles, links):
            data = {
                'img': img.get('src'),
                'title': title.get('title'),
                'link': link.get('href'),
            }
            print(data)
            item = data['img']
            print(item)
            if not item:  # <img> without a src attribute — nothing to download
                continue
            # Use the URL's last path component as the local file name; the
            # original fixed slice item[-21:-16] breaks on shorter URLs, so
            # keep it only as a fallback.
            filename = item.rstrip('/').rsplit('/', 1)[-1] or item[-21:-16]
            urllib.request.urlretrieve(item, folder_path + filename)
def get_more_gages(start, end):  # crawl pages [start, end)
    """Scrape every listing page from `start` up to (not including) `end`,
    pausing two seconds between requests to stay polite to the server."""
    for page_no in range(start, end):
        page_url = f'{url}{page_no}'
        get_gage(page_url)
        time.sleep(2)
get_more_gages(1,3) # each page loads a batch of 12 images