import os
import re

import requests
from bs4 import BeautifulSoup


def f(url_data):
    # Drop the last path segment of a URL, keeping the trailing slash,
    # e.g. 'http://site/a/b/page.html' -> 'http://site/a/b/'
    parts = url_data.split("/")
    s = ''
    for part in parts[:-1]:
        s += part + '/'
    return s


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
def getsum(url, name):
    # Read the "current/total" counter in the gallery's title bar to find out
    # how many pages the set spans, then hand off to getAll().
    get_page_text = requests.get(url=url, headers=headers)
    get_page_text.encoding = 'gb2312'
    tt_soup = BeautifulSoup(get_page_text.text, 'lxml')
    img_sum_data = list(tt_soup.find('div', class_='wzfz tu-tit fix'))
    pattern = r'.*?\d/(\d{1,3})'
    img_sum = int(re.findall(pattern, img_sum_data[1].text, re.S)[0])
    getAll(img_sum, url, name)
def getAll(img_sum, url, name):
    # Collect the URL of every page in the gallery by following the ".next"
    # pagination link img_sum - 1 times, then download them all.
    page_all = []
    for _ in range(img_sum - 1):
        base = f(url)  # directory part of the current page URL
        page_text = requests.get(url=url, headers=headers)
        page_text.encoding = 'gb2312'
        soup = BeautifulSoup(page_text.text, 'lxml')
        next_link = soup.select('.next')  # renamed: 'next' shadows the builtin
        page_all.append(url)
        url = base + next_link[0]['href']
    down(page_all, name)
def down_img(url, name, n):
    # Fetch one image and save it as ./壁纸/<name>/<name><n>.jpg
    img_bytes = requests.get(url=url, headers=headers).content
    path = f'./壁纸/{name}'
    os.makedirs(path, exist_ok=True)  # replaces the duplicated exists()/mkdir() branches
    with open(f'{path}/{name}{n}.jpg', 'wb') as fp:  # 'fp', since 'f' is the helper above
        fp.write(img_bytes)
    print(f'{name}{n}.jpg downloaded successfully!')
def down(page_all, name):
    # Grab the main photo from each collected page and save it.
    n = 0
    for page_url in page_all:
        n += 1
        page_text = requests.get(url=page_url, headers=headers)
        page_text.encoding = 'gb2312'
        soup = BeautifulSoup(page_text.text, 'lxml')
        img_url = soup.select('.photo img')[0]['src']
        down_img(img_url, name, n)
def main():
    # Scrape the homepage's .picbz list for gallery links and titles,
    # then crawl each gallery in turn.
    url = 'http://www.jj20.com'
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    tt_url_data = soup.select('.picbz>li>a')
    tt_url = []
    tt_url_name = []
    for a in tt_url_data:
        tt_url.append(a['href'])
        tt_url_name.append(a['title'])
    for img_url, img_name in zip(tt_url, tt_url_name):
        getsum(url + img_url, img_name)


if __name__ == '__main__':
    main()
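The counter that getsum() parses is text of the form "1/25" in the gallery's title bar. A quick check of the regex on a made-up sample (not taken from the live site):

import re

sample = '1/25'  # hypothetical counter text as it appears in the title div
print(re.findall(r'.*?\d/(\d{1,3})', sample, re.S))  # -> ['25'], the page total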
Update: the reworked version below drops the page-count parsing (getsum) and instead follows each gallery's ".next" link until it runs out.
import os

import requests
from bs4 import BeautifulSoup  # the 'lxml' parser only needs lxml installed, not imported


def f(url_data):
    # Same helper as above: drop the last path segment, keep the trailing slash.
    parts = url_data.split("/")
    s = ''
    for part in parts[:-1]:
        s += part + '/'
    return s


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
def getAll(url, name):
    # Seed the crawl with the gallery's first page if it is reachable;
    # down() follows the ".next" links from there on its own.
    page_all = []
    try:
        page_text = requests.get(url=url, headers=headers)
        page_text.encoding = 'gb2312'
        BeautifulSoup(page_text.text, 'lxml')  # parse check only
        page_all.append(url)
    except requests.RequestException as e:
        print(f'failed to open {url}: {e}')  # was a bare except printing "1"
    down(page_all, name)
def down_img(url, name, n):
    # Image src attributes are protocol-relative ("//..."), so prepend "http:".
    img_bytes = requests.get(url=f"http:{url}", headers=headers).content
    path = f'./壁纸/{name}'
    os.makedirs(path, exist_ok=True)
    with open(f'{path}/{name}{n}.jpg', 'wb') as fp:  # 'fp', since 'f' is the helper above
        fp.write(img_bytes)
    print(f'{name}{n}.jpg downloaded successfully!')
def geturl(url):
    # Return the src of the main photo on a gallery page.
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    img_url = soup.select(".photo>a>img")
    return img_url[0]['src']
def down(page_all, name):
    # From each seed page, keep resolving the ".next" link (hard-coded to this
    # category's /bz/nxxz/shxz/ directory) and downloading until a lookup
    # fails, which marks the end of the gallery.
    for start_url in page_all:
        n = 0
        url_1 = start_url
        while True:
            try:
                n += 1
                if n >= 2:
                    img_url = getnext(url_1)
                    url_1 = f"http://www.jj20.com/bz/nxxz/shxz/{img_url}"
                    url = geturl(url_1)
                else:
                    url = geturl(start_url)
                down_img(url, name, n)
            except (requests.RequestException, IndexError, KeyError):
                break  # no further page; a bare except here would mask real bugs
def getnext(url):
    # href of the ".next" pagination link, relative to the gallery directory.
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    next_link = soup.select(".next")  # renamed: 'next' shadows the builtin
    return next_link[0]['href']
def main():
    # Category list page; each gallery link can appear more than once in the
    # .picbz list, so deduplicate the URLs in order. Titles (one per gallery,
    # from the img alt attribute) then line up with the deduplicated list by index.
    url = 'http://www.jj20.com/bz/nxxz/list_7_cc_14.html'
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    tt_url_data = soup.select('.picbz>li>a')
    tt_name = soup.select('.picbz>li>a>img')
    tt_url = []
    tt_url_name = []
    for a in tt_url_data:
        tt_url.append(f"http://www.jj20.com/{a['href']}")
    for img in tt_name:
        tt_url_name.append(img['alt'])
    tt_url_1 = []
    for link in tt_url:
        if link not in tt_url_1:
            tt_url_1.append(link)
    for i in range(len(tt_url_1)):
        getAll(tt_url_1[i], tt_url_name[i])


if __name__ == '__main__':
    main()
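The hand-rolled f() helper and the manual URL stitching in down() can be replaced by the standard library's urllib.parse.urljoin, which resolves a relative href against the page it came from the way a browser would. A minimal sketch; the example URLs below are made up for illustration:

from urllib.parse import urljoin

page = 'http://www.jj20.com/bz/nxxz/shxz/example.html'  # hypothetical page URL
href = 'example_2.html'                                 # hypothetical .next href
print(urljoin(page, href))  # -> http://www.jj20.com/bz/nxxz/shxz/example_2.html

# urljoin also resolves protocol-relative image sources, replacing f"http:{url}":
src = '//img.example.com/pic.jpg'  # hypothetical src attribute
print(urljoin(page, src))          # -> http://img.example.com/pic.jpg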