from selenium import webdriver
import time
import re
import xlwt
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get(name1, name2):
    driver = webdriver.Chrome(executable_path=r'D:\chromweb\chromedriver.exe')
    wait = WebDriverWait(driver, 300)
    # Maximize the browser window
    driver.maximize_window()
    driver.get(
        'https://kns.cnki.net/kns8/AdvSearch?dbprefix=SCDB&&crossDbcodes=CJFQ%2CCDMD%2CCIPD%2CCCND%2CCISD%2CSNAD%2CBDZK%2CCJFN%2CCCJD')
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//ul[@class="search-classify-menu"]/li[4]'))).click()
    # Locate the query input box
    input = wait.until(
        EC.presence_of_element_located((By.XPATH, '//textarea[@class="textarea-major ac_input"]'))
    )
    # Clear it, then type the search topic / keyword expression
    input.clear()
    input.send_keys(name1)
    # Explicit wait for the search button, then click it
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//input[@class="btn-search"]'))
    ).click()
    time.sleep(3)
    # Read the total number of results and compute the page count (20 results per page)
    total = driver.find_element_by_xpath('//*[@id="countPageDiv"]/span/em')
    total = total.text
    print(name2 + "一共有" + total + "条数据")
    total = re.sub(r"\D", "", total)
    page = (int(total) // 20) + 1
    print('一共有{}'.format(page) + '页文章')
    # Create the workbook and write the header row
    df = xlwt.Workbook()
    sheet1 = df.add_sheet('bookname', cell_overwrite_ok=True)
    rowsTitle = [u'题名', u'来源', u'发表时间', u'数据库']
    for i in range(len(rowsTitle)):
        sheet1.write(0, i, rowsTitle[i])
    # Walk every results page, 20 rows at a time
    for p in range(page):
        for i in range(1, 21):
            try:
                # Title, source, publication date, and database for row i of the result table
                ctitle = driver.find_element_by_xpath('//*[@id="gridTable"]/table/tbody/tr[{}]/td[2]'.format(i)).text
                csource = driver.find_element_by_xpath('//*[@id="gridTable"]/table/tbody/tr[{}]/td[4]/a'.format(i)).text
                cdatatime = driver.find_element_by_xpath('//*[@id="gridTable"]/table/tbody/tr[{}]/td[5]'.format(i)).text
                cdatabase = driver.find_element_by_xpath('//*[@id="gridTable"]/table/tbody/tr[{}]/td[6]'.format(i)).text
                sheet1.write(p * 20 + i, 0, ctitle)
                sheet1.write(p * 20 + i, 1, csource)
                sheet1.write(p * 20 + i, 2, cdatatime)
                sheet1.write(p * 20 + i, 3, cdatabase)
                # Save after every row so partial results survive a crash
                df.save(r'E:\cnki\信息.xls')
            except:
                print(str(int(p) + 1) + '页未抓取到')
        print('已抓取第' + str(int(p) + 1) + '页')
        # Scroll the "next page" button into view and click it
        flag4 = driver.find_element_by_xpath('//*[@id="PageNext"]')
        driver.execute_script("arguments[0].scrollIntoView();", flag4)
        flag4.click()
        time.sleep(10)
def isElementExist(driver, element):
    # Helper (not used above): return True if the XPath matches an element, False otherwise
    flag = True
    try:
        driver.find_element_by_xpath(element)
        return flag
    except:
        flag = False
        return flag
if __name__ == '__main__':
    name1 = "TI='经济'"  # query expression passed to the CNKI professional search
    name2 = '经济'
    get(name1, name2)
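The listing above targets the Selenium 3 API: executable_path and the find_element_by_xpath family were removed in Selenium 4. If you run it on a newer Selenium, this is a minimal sketch of the equivalent calls (paths unchanged from the script):

# Selenium 4 replacements for the deprecated calls used above
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(service=Service(r'D:\chromweb\chromedriver.exe'))
total = driver.find_element(By.XPATH, '//*[@id="countPageDiv"]/span/em')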
This script is similar to the previous one, but its purpose is to collect paper information rather than to download the papers themselves.
This post shows how to use Python's Selenium library to automate access to the CNKI database: it enters a query expression, then scrapes each paper's basic information, such as title, source, publication date, and database. The scraped data are saved to an Excel spreadsheet for further analysis.
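Once the spreadsheet has been written, it can be read back for that analysis. A minimal sketch using xlrd, assuming the path E:\cnki\信息.xls and the sheet name 'bookname' from the script above:

import xlrd  # reads the legacy .xls format that xlwt writes

wb = xlrd.open_workbook(r'E:\cnki\信息.xls')
sheet = wb.sheet_by_name('bookname')
for r in range(1, sheet.nrows):  # row 0 is the header
    title, source, date, database = (sheet.cell_value(r, c) for c in range(4))
    print(title, source, date, database, sep=' | ')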