# My first web crawler. The code is simple and borrows heavily from earlier examples by others.
import re
import os
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Path to the PhantomJS binary — the headless browser used to render
# the JavaScript-generated image links on each page.
driver = webdriver.PhantomJS(executable_path='D:/phantomjs-2.1.1-windows/bin/phantomjs')


def jiandanSpider(Url, basePath):
    """Download the full-size images from each jandan.net page in *Url*.

    Url: iterable of page URLs to visit (may be a generator).
    basePath: directory path (with trailing slash) where images are saved.

    Animated GIFs are skipped; every other image is fetched with
    ``requests`` and written as ``妹子图 <page>-<n><ext>``.
    """
    # The original crashed with FileNotFoundError unless the target
    # directory already existed — create it up front.
    os.makedirs(basePath, exist_ok=True)
    page = 1
    for page_url in Url:
        driver.get(page_url)
        driver.implicitly_wait(10)  # allow up to 10s for dynamic content to load
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Collect the full-size image links, skipping animated GIFs.
        # Links in the page are protocol-relative ("//..."), hence the 'http:' prefix.
        img_urls = [
            'http:' + a.get('href')
            for a in soup.select('a.view_img_link')
            if 'gif' not in a.get('href')
        ]
        for n, img_url in enumerate(img_urls, start=1):
            print("第 %s 张" % n)
            # NOTE: img_url[-4:] assumes a 4-character extension such as
            # ".jpg" — kept from the original; ".jpeg" would be truncated.
            with open(basePath + '妹子图 %s-%s' % (page, n) + img_url[-4:], 'wb') as f:
                # Timeout so one stalled download cannot hang the whole run.
                f.write(requests.get(img_url, timeout=30).content)
            print('下载完成')
        page += 1


if __name__ == '__main__':
    # Pages 1-4 of the gallery.
    urlPool = ('http://jandan.net/ooxx/page-{}#comments'.format(i)
               for i in range(1, 5))
    basePath = 'D:/jiandanImage/'
    try:
        jiandanSpider(urlPool, basePath)
    finally:
        driver.quit()  # release the PhantomJS browser process