# Scrape the cover background images of all CNKI Chinese/English journals
# using Python + XPath + BeautifulSoup.
import json
import requests
from bs4 import BeautifulSoup
from lxml import etree
from pymongo import MongoClient
from gridfs import *
# Page through the CNKI journal navigation listing and collect every
# journal's code from the result list.
# NOTE(review): `searchStateJson`, `base_url`, and `headers` must be defined
# earlier in the file — they are not visible in this chunk; confirm.
journal_encode_list = []  # accumulated across ALL pages, not reset per page
for index in range(1, 107):  # 107 result pages — presumably fixed for this category; TODO confirm
    payload = {
        'searchStateJson': json.dumps(searchStateJson),
        'displaymode': 1,
        'pageindex': index,
        'pagecount': 21,
        'index': 'subject',
        'searchType': '刊名(曾用刊名)',
        'clickName': '社会科学II',
        'switchdata': 'leftnavi'
    }
    response = requests.post(base_url, data=payload, headers=headers)
    ul_html = etree.HTML(response.text)
    # Every <a> in the journal list; the journal code sits in the second
    # query parameter of each href (split on '&' then '=').
    urls = ul_html.xpath('*//ul[@class="list_tup"]/li/a/@href')
    for ur in urls:
        journal_code = ur.split('&')[1].split('=')[1]
        journal_encode_list.append(journal_code)
# Fetch each journal's detail page and save its cover image to disk.
# NOTE(review): `headers`, `base_dir`, and `journal_encode_list` must be
# defined earlier in the file; `base_dir` should end with a path separator.
for co in journal_encode_list:
    url = 'https://navi.cnki.net/knavi/journals/' + co + '/detail'
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'html.parser')
    # The page <title> doubles as the file name; '/' would break the path.
    title = soup.title.string.replace('/', '_')
    # Guard: some journals may lack a cover image — soup.find() returns
    # None then, and subscripting it would raise TypeError.
    img_tag = soup.find('img', attrs={'class': 'pic-book'})
    if img_tag is None or not img_tag.get('src'):
        print('no cover image for', title)
        continue
    cntk_img_url = img_tag['src']
    # src appears to be protocol-relative ("//host/..."), hence the prefix.
    img = requests.get('http:' + cntk_img_url)
    # 'wb' instead of the original 'ab': append mode would tack a second
    # JPEG onto an existing file on re-runs, producing a corrupt image.
    with open(base_dir + title + '.jpg', 'wb') as f:
        f.write(img.content)
    print(title)
    print('-----------')